From: Tom Stellard Date: Sat, 13 Jun 2015 03:28:10 +0000 (+0000) Subject: R600 -> AMDGPU rename X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=953c6814730951ad9a286d7991e9c8c481433d45;p=oota-llvm.git R600 -> AMDGPU rename git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239657 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 026fe479abd..da731497997 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) set(LLVM_ALL_TARGETS AArch64 + AMDGPU ARM BPF CppBackend @@ -184,7 +185,6 @@ set(LLVM_ALL_TARGETS MSP430 NVPTX PowerPC - R600 Sparc SystemZ X86 diff --git a/autoconf/configure.ac b/autoconf/configure.ac index 11ba0511799..5b70fbd1bbf 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -1097,7 +1097,7 @@ if test "$llvm_cv_enable_crash_overrides" = "yes" ; then fi dnl List all possible targets -ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF" AC_SUBST(ALL_TARGETS,$ALL_TARGETS) dnl Allow specific targets to be specified for building (or not) @@ -1132,7 +1132,8 @@ case "$enableval" in hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;; nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;; systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;; - r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;; + amdgpu) ;& + r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;; host) case "$llvm_cv_target_arch" in x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; diff --git a/configure b/configure index 6cb9f2d9281..73fce67b058 100755 --- a/configure +++ b/configure @@ -5628,7 +5628,7 @@ _ACEOF fi -ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600 BPF" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ AMDGPU BPF" ALL_TARGETS=$ALL_TARGETS @@ -5665,7 +5665,8 @@ case "$enableval" in hexagon) TARGETS_TO_BUILD="Hexagon $TARGETS_TO_BUILD" ;; nvptx) TARGETS_TO_BUILD="NVPTX $TARGETS_TO_BUILD" ;; systemz) TARGETS_TO_BUILD="SystemZ $TARGETS_TO_BUILD" ;; - r600) TARGETS_TO_BUILD="R600 $TARGETS_TO_BUILD" ;; + amdgpu) ;& + r600) TARGETS_TO_BUILD="AMDGPU $TARGETS_TO_BUILD" ;; host) case "$llvm_cv_target_arch" in x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst new file mode 100644 index 00000000000..3cb41cebfff --- /dev/null +++ b/docs/AMDGPUUsage.rst @@ -0,0 +1,94 @@ +============================== +User Guide for AMDGPU Back-end +============================== + +Introduction +============ + +The AMDGPU back-end provides ISA code generation for AMD GPUs, starting with +the R600 family up until the current Volcanic Islands (GCN Gen 3). + + +Assembler +========= + +The assembler is currently considered experimental. + +For syntax examples look in test/MC/AMDGPU. + +Below some of the currently supported features (modulo bugs). These +all apply to the Southern Islands ISA, Sea Islands and Volcanic Islands +are also supported but may be missing some instructions and have more bugs: + +DS Instructions +--------------- +All DS instructions are supported. 
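+
+For illustration only (the operand choices below are invented for this guide;
+test/MC/AMDGPU holds the authoritative syntax tests):
+
+.. code-block:: nasm
+
+  // Read and write one dword of LDS through a byte offset.
+  ds_read_b32 v1, v2 offset:16
+  ds_write_b32 v2, v1 offset:16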
+ +FLAT Instructions +------------------ +These instructions are only present in the Sea Islands and Volcanic Islands +instruction set. All FLAT instructions are supported for these architectures + +MUBUF Instructions +------------------ +All non-atomic MUBUF instructions are supported. + +SMRD Instructions +----------------- +Only the s_load_dword* SMRD instructions are supported. + +SOP1 Instructions +----------------- +All SOP1 instructions are supported. + +SOP2 Instructions +----------------- +All SOP2 instructions are supported. + +SOPC Instructions +----------------- +All SOPC instructions are supported. + +SOPP Instructions +----------------- + +Unless otherwise mentioned, all SOPP instructions that have one or more +operands accept integer operands only. No verification is performed +on the operands, so it is up to the programmer to be familiar with the +range or acceptable values. + +s_waitcnt +^^^^^^^^^ + +s_waitcnt accepts named arguments to specify which memory counter(s) to +wait for. + +.. code-block:: nasm + + // Wait for all counters to be 0 + s_waitcnt 0 + + // Equivalent to s_waitcnt 0. Counter names can also be delimited by + // '&' or ','. + s_waitcnt vmcnt(0) expcnt(0) lgkcmt(0) + + // Wait for vmcnt counter to be 1. + s_waitcnt vmcnt(1) + +VOP1, VOP2, VOP3, VOPC Instructions +----------------------------------- + +All 32-bit and 64-bit encodings should work. + +The assembler will automatically detect which encoding size to use for +VOP1, VOP2, and VOPC instructions based on the operands. If you want to force +a specific encoding size, you can add an _e32 (for 32-bit encoding) or +_e64 (for 64-bit encoding) suffix to the instruction. Most, but not all +instructions support an explicit suffix. These are all valid assembly +strings: + +.. code-block:: nasm + + v_mul_i32_i24 v1, v2, v3 + v_mul_i32_i24_e32 v1, v2, v3 + v_mul_i32_i24_e64 v1, v2, v3 diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst index 2dfdc9b142d..900ba24e230 100644 --- a/docs/CompilerWriterInfo.rst +++ b/docs/CompilerWriterInfo.rst @@ -68,8 +68,8 @@ Other documents, collections, notes * `PowerPC64 alignment of long doubles (from GCC) `_ * `Long branch stubs for powerpc64-linux (from binutils) `_ -R600 ----- +AMDGPU +------ * `AMD R6xx shader ISA `_ * `AMD R7xx shader ISA `_ diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 18b3c1d87cc..212fa0b5833 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -711,7 +711,7 @@ used by people developing LLVM. | | as ``LLVM_ALL_TARGETS``, and can be set to include | | | out-of-tree targets. The default value includes: | | | ``AArch64, ARM, CppBackend, Hexagon, | -| | Mips, MSP430, NVPTX, PowerPC, R600, Sparc, | +| | Mips, MSP430, NVPTX, PowerPC, AMDGPU, Sparc, | | | SystemZ, X86, XCore``. | +-------------------------+----------------------------------------------------+ | LLVM_ENABLE_DOXYGEN | Build doxygen-based documentation from the source | diff --git a/docs/R600Usage.rst b/docs/R600Usage.rst deleted file mode 100644 index 9bd16f46098..00000000000 --- a/docs/R600Usage.rst +++ /dev/null @@ -1,94 +0,0 @@ -============================ -User Guide for R600 Back-end -============================ - -Introduction -============ - -The R600 back-end provides ISA code generation for AMD GPUs, starting with -the R600 family up until the current Volcanic Islands (GCN Gen 3). - - -Assembler -========= - -The assembler is currently considered experimental. - -For syntax examples look in test/MC/R600. 
- -Below some of the currently supported features (modulo bugs). These -all apply to the Southern Islands ISA, Sea Islands and Volcanic Islands -are also supported but may be missing some instructions and have more bugs: - -DS Instructions ---------------- -All DS instructions are supported. - -FLAT Instructions ------------------- -These instructions are only present in the Sea Islands and Volcanic Islands -instruction set. All FLAT instructions are supported for these architectures - -MUBUF Instructions ------------------- -All non-atomic MUBUF instructions are supported. - -SMRD Instructions ------------------ -Only the s_load_dword* SMRD instructions are supported. - -SOP1 Instructions ------------------ -All SOP1 instructions are supported. - -SOP2 Instructions ------------------ -All SOP2 instructions are supported. - -SOPC Instructions ------------------ -All SOPC instructions are supported. - -SOPP Instructions ------------------ - -Unless otherwise mentioned, all SOPP instructions that have one or more -operands accept integer operands only. No verification is performed -on the operands, so it is up to the programmer to be familiar with the -range or acceptable values. - -s_waitcnt -^^^^^^^^^ - -s_waitcnt accepts named arguments to specify which memory counter(s) to -wait for. - -.. code-block:: nasm - - // Wait for all counters to be 0 - s_waitcnt 0 - - // Equivalent to s_waitcnt 0. Counter names can also be delimited by - // '&' or ','. - s_waitcnt vmcnt(0) expcnt(0) lgkcmt(0) - - // Wait for vmcnt counter to be 1. - s_waitcnt vmcnt(1) - -VOP1, VOP2, VOP3, VOPC Instructions ------------------------------------ - -All 32-bit and 64-bit encodings should work. - -The assembler will automatically detect which encoding size to use for -VOP1, VOP2, and VOPC instructions based on the operands. If you want to force -a specific encoding size, you can add an _e32 (for 32-bit encoding) or -_e64 (for 64-bit encoding) suffix to the instruction. Most, but not all -instructions support an explicit suffix. These are all valid assembly -strings: - -.. code-block:: nasm - - v_mul_i32_i24 v1, v2, v3 - v_mul_i32_i24_e32 v1, v2, v3 - v_mul_i32_i24_e64 v1, v2, v3 diff --git a/docs/index.rst b/docs/index.rst index 2cc5b8bf095..0b681180970 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -252,7 +252,7 @@ For API clients and LLVM developers. WritingAnLLVMPass HowToUseAttributes NVPTXUsage - R600Usage + AMDGPUUsage StackMaps InAlloca BigEndianNEON @@ -338,8 +338,8 @@ For API clients and LLVM developers. :doc:`NVPTXUsage` This document describes using the NVPTX back-end to compile GPU kernels. -:doc:`R600Usage` - This document describes how to use the R600 back-end. +:doc:`AMDGPUUsage` + This document describes how to use the AMDGPU back-end. :doc:`StackMaps` LLVM support for mapping instruction addresses to the location of diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h new file mode 100644 index 00000000000..0a05d25189b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -0,0 +1,148 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H +#define LLVM_LIB_TARGET_R600_AMDGPU_H + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUInstrPrinter; +class AMDGPUSubtarget; +class AMDGPUTargetMachine; +class FunctionPass; +class MCAsmInfo; +class raw_ostream; +class Target; +class TargetMachine; + +// R600 Passes +FunctionPass *createR600VectorRegMerger(TargetMachine &tm); +FunctionPass *createR600TextureIntrinsicsReplacer(); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600EmitClauseMarkers(); +FunctionPass *createR600ClauseMergePass(TargetMachine &tm); +FunctionPass *createR600Packetizer(TargetMachine &tm); +FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); +FunctionPass *createAMDGPUCFGStructurizerPass(); + +// SI Passes +FunctionPass *createSITypeRewriter(); +FunctionPass *createSIAnnotateControlFlowPass(); +FunctionPass *createSIFoldOperandsPass(); +FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIShrinkInstructionsPass(); +FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSIFixControlFlowLiveIntervalsPass(); +FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLiveRangesPass(); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIPrepareScratchRegs(); + +void initializeSIFoldOperandsPass(PassRegistry &); +extern char &SIFoldOperandsID; + +void initializeSILowerI1CopiesPass(PassRegistry &); +extern char &SILowerI1CopiesID; + +void initializeSILoadStoreOptimizerPass(PassRegistry &); +extern char &SILoadStoreOptimizerID; + +// Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); +Pass *createAMDGPUStructurizeCFGPass(); +FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +ModulePass *createAMDGPUAlwaysInlinePass(); + +void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); +extern char &SIFixControlFlowLiveIntervalsID; + +void initializeSIFixSGPRLiveRangesPass(PassRegistry&); +extern char &SIFixSGPRLiveRangesID; + + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +namespace AMDGPU { +enum TargetIndex { + TI_CONSTDATA_START, + TI_SCRATCH_RSRC_DWORD0, + TI_SCRATCH_RSRC_DWORD1, + TI_SCRATCH_RSRC_DWORD2, + TI_SCRATCH_RSRC_DWORD3 +}; +} + +#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a separate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces : unsigned { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory + LOCAL_ADDRESS = 3, ///< Address space for local memory. + FLAT_ADDRESS = 4, ///< Address space for flat memory. + REGION_ADDRESS = 5, ///< Address space for region memory. 
+ PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + + // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this + // order to be able to dynamically index a constant buffer, for example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + ADDRESS_NONE = 24, ///< Address space for unknown memory. + LAST_ADDRESS = ADDRESS_NONE, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u +}; + +} // namespace AMDGPUAS + +#endif diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td new file mode 100644 index 00000000000..2e7e39a54d3 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -0,0 +1,266 @@ +//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Subtarget Features +//===----------------------------------------------------------------------===// + +// Debugging Features + +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", + "EnableIRStructurizer", + "false", + "Disable IR Structurizer">; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass">; + +// Target features + +def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", + "EnableIfCvt", + "false", + "Disable the if conversion pass">; + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations">; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64]>; + +def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add", + []>; + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. 
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "true", + "Specify if 64-bit addressing should be used">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache">; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA">; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug">; + +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass">; + +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory">; + +def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", + "SGPRInitBug", + "true", + "VI SGPR initilization bug requiring a fixed SGPR allocation size">; + +class SubtargetFeatureFetchLimit : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +class SubtargetFeatureWavefrontSize : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast(Value), + "The number of threads per wavefront">; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureLDSBankCount : SubtargetFeature < + "ldsbankcount"#Value, + "LDSBankCount", + !cast(Value), + "The number of LDS banks per compute unit.">; + +def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; +def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; + +class SubtargetFeatureLocalMemorySize : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast(Value), + "The size of local memory in bytes">; + +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU">; + +def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", + "GCN1Encoding", + "true", + "Encoding format for SI and CI">; + +def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", + "GCN3Encoding", + "true", + "Encoding format for VI">; + +def FeatureCIInsts : SubtargetFeature<"ci-insts", + "CIInsts", + "true", + "Additional intstructions for CI+">; + +// Dummy feature used to disable assembler instructions. 
+def FeatureDisable : SubtargetFeature<"", + "FeatureDisable","true", + "Dummy feature to disable assembler" + " instructions">; + +class SubtargetFeatureGeneration Implies> : + SubtargetFeature ; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +def FeatureR600 : SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; + +def FeatureR700 : SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; + +def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; + +def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + +def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32]>; + +def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, + FeatureGCN1Encoding, FeatureCIInsts]>; + +def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; + +//===----------------------------------------------------------------------===// + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def AMDGPUAsmParser : AsmParser { + // Some of the R600 registers have the same name, so this crashes. + // For example T0_XYZW and T0_XY both have the asm name T0. 
+ let ShouldEmitMatchRegisterName = 0; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyParsers = [AMDGPUAsmParser]; +} + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Predicate helper class +//===----------------------------------------------------------------------===// + +def TruePredicate : Predicate<"true">; +def isSICI : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" +>, AssemblerPredicate<"FeatureGCN1Encoding">; + +class PredicateControl { + Predicate SubtargetPredicate; + Predicate SIAssemblerPredicate = isSICI; + list AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list OtherPredicates = []; + list Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" +include "AMDGPUCallingConv.td" diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp new file mode 100644 index 00000000000..0b426bc63dd --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -0,0 +1,67 @@ +//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass marks all internal functions as always_inline and creates +/// duplicates of all other functions a marks the duplicates as always_inline. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +class AMDGPUAlwaysInline : public ModulePass { + + static char ID; + +public: + AMDGPUAlwaysInline() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } +}; + +} // End anonymous namespace + +char AMDGPUAlwaysInline::ID = 0; + +bool AMDGPUAlwaysInline::runOnModule(Module &M) { + + std::vector FuncsToClone; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && + !F.hasFnAttribute(Attribute::NoInline)) + FuncsToClone.push_back(&F); + } + + for (Function *F : FuncsToClone) { + ValueToValueMapTy VMap; + Function *NewFunc = CloneFunction(F, VMap, false); + NewFunc->setLinkage(GlobalValue::InternalLinkage); + F->getParent()->getFunctionList().push_back(NewFunc); + F->replaceAllUsesWith(NewFunc); + } + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { + F.addFnAttr(Attribute::AlwaysInline); + } + } + return false; +} + +ModulePass *llvm::createAMDGPUAlwaysInlinePass() { + return new AMDGPUAlwaysInline(); +} diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp new file mode 100644 index 00000000000..29c2da61add --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -0,0 +1,600 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. +// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUAsmPrinter.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "AMDGPU.h" +#include "AMDKernelCodeT.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be override to flush with the -cl-denorms-are-zero compiler flag. 
+// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. +// +// FIXME: It seems some instructions do not support single precision denormals +// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, +// and sin_f32, cos_f32 on most parts). + +// We want to use these instructions, and using fp32 denormals also causes +// instructions to run at the double precision rate for the device so it's +// probably best to just report no single precision denormals. +static uint32_t getFPMode(const MachineFunction &F) { + const AMDGPUSubtarget& ST = F.getSubtarget(); + // TODO: Is there any real use for the flush in only / flush out only modes? + + uint32_t FP32Denormals = + ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + uint32_t FP64Denormals = + ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP32Denormals) | + FP_DENORM_MODE_DP(FP64Denormals); +} + +static AsmPrinter * +createAMDGPUAsmPrinterPass(TargetMachine &tm, + std::unique_ptr &&Streamer) { + return new AMDGPUAsmPrinter(tm, std::move(Streamer)); +} + +extern "C" void LLVMInitializeAMDGPUAsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); +} + +AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer) + : AsmPrinter(TM, std::move(Streamer)) {} + +void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { + + // This label is used to mark the end of the .text section. + const TargetLoweringObjectFile &TLOF = getObjFileLowering(); + OutStreamer->SwitchSection(TLOF.getTextSection()); + MCSymbol *EndOfTextLabel = + OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + OutStreamer->EmitLabel(EndOfTextLabel); +} + +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + // The starting address of all shader programs must be 256 bytes aligned. 
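+  // setAlignment() takes a log2 value, so 8 requests 1 << 8 = 256 byte
+  // alignment; the EmitCodeAlignment(2 << (MF.getAlignment() - 1)) call below
+  // turns it back into a byte count.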
+ MF.setAlignment(8); + + SetupMachineFunction(MF); + + MCContext &Context = getObjFileLowering().getContext(); + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + + const AMDGPUSubtarget &STM = MF.getSubtarget(); + SIProgramInfo KernelInfo; + if (STM.isAmdHsaOS()) { + getSIProgramInfo(KernelInfo, MF); + EmitAmdKernelCodeT(MF, KernelInfo); + OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1)); + } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); + EmitProgramInfoSI(MF, KernelInfo); + } else { + EmitProgramInfoR600(MF); + } + + DisasmLines.clear(); + HexLines.clear(); + DisasmLineMaxLen = 0; + + EmitFunctionBody(); + + if (isVerbose()) { + MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(CommentSection); + + if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + OutStreamer->emitRawComment(" Kernel info:", false); + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), + false); + OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), + false); + OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), + false); + OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); + OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), + false); + } else { + R600MachineFunctionInfo *MFI = MF.getInfo(); + OutStreamer->emitRawComment( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); + } + } + + if (STM.dumpCode()) { + + OutStreamer->SwitchSection( + Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); + + for (size_t i = 0; i < DisasmLines.size(); ++i) { + std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; + + OutStreamer->EmitBytes(StringRef(DisasmLines[i])); + OutStreamer->EmitBytes(StringRef(Comment)); + } + } + + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const AMDGPUSubtarget &STM = MF.getSubtarget(); + const R600RegisterInfo *RI = + static_cast(STM.getRegisterInfo()); + const R600MachineFunctionInfo *MFI = MF.getInfo(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + + // Register with value > 127 aren't GPR + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + // Evergreen / Northern Islands + switch (MFI->getShaderType()) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MFI->getShaderType()) { + default: // Fall through + case 
ShaderType::GEOMETRY: // Fall through + case ShaderType::COMPUTE: // Fall through + case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->StackSize), 4); + OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + } +} + +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + uint64_t CodeSize = 0; + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + bool FlatUsed = false; + const SIRegisterInfo *RI = + static_cast(STM.getRegisterInfo()); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: CodeSize should account for multiple functions. + CodeSize += MI.getDesc().Size; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + unsigned width = 0; + bool isSGPR = false; + + if (!MO.isReg()) { + continue; + } + unsigned reg = MO.getReg(); + if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || + reg == AMDGPU::VCC_HI) { + VCCUsed = true; + continue; + } else if (reg == AMDGPU::FLAT_SCR || + reg == AMDGPU::FLAT_SCR_LO || + reg == AMDGPU::FLAT_SCR_HI) { + FlatUsed = true; + continue; + } + + switch (reg) { + default: break; + case AMDGPU::SCC: + case AMDGPU::EXEC: + case AMDGPU::M0: + continue; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(reg)) { + isSGPR = false; + width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(reg)) { + isSGPR = false; + width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(reg)) { + isSGPR = true; + width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(reg)) { + isSGPR = false; + width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned hwReg = RI->getEncodingValue(reg) & 0xff; + unsigned maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; + } + } + } + } + + if (VCCUsed) + MaxSGPR += 2; + + if (FlatUsed) + MaxSGPR += 2; + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. 
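+  // For example, if the highest VGPR touched was v3, NumVGPR becomes 4; the
+  // VCC and flat-scratch adjustments above have already been folded into
+  // MaxSGPR.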
+ ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR = MaxSGPR + 1; + + if (STM.hasSGPRInitBug()) { + if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) + llvm_unreachable("Too many SGPRs used with the SGPR init bug"); + + ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + } + + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; + ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + + ProgInfo.FlatUsed = FlatUsed; + ProgInfo.VCCUsed = VCCUsed; + ProgInfo.CodeLen = CodeSize; + + unsigned LDSAlignShift; + if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + // LDS is allocated in 64 dword blocks. + LDSAlignShift = 8; + } else { + // LDS is allocated in 128 dword blocks. + LDSAlignShift = 9; + } + + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); + + ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; + ProgInfo.LDSBlocks = + RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + + // Scratch is allocated in 256 dword blocks. + unsigned ScratchAlignShift = 10; + // We need to program the hardware with the amount of scratch memory that + // is used by the entire wave. ProgInfo.ScratchSize is the amount of + // scratch memory used per thread. + ProgInfo.ScratchBlocks = + RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), + 1 << ScratchAlignShift) >> ScratchAlignShift; + + ProgInfo.ComputePGMRSrc1 = + S_00B848_VGPRS(ProgInfo.VGPRBlocks) | + S_00B848_SGPRS(ProgInfo.SGPRBlocks) | + S_00B848_PRIORITY(ProgInfo.Priority) | + S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | + S_00B848_PRIV(ProgInfo.Priv) | + S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | + S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + + ProgInfo.ComputePGMRSrc2 = + S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | + S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | + S_00B84C_TGID_X_EN(1) | + S_00B84C_TGID_Y_EN(1) | + S_00B84C_TGID_Z_EN(1) | + S_00B84C_TG_SIZE_EN(1) | + S_00B84C_TIDIG_COMP_CNT(2) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); +} + +static unsigned getRsrcReg(unsigned ShaderType) { + switch (ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; + case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + } +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) { + const AMDGPUSubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + + if (MFI->getShaderType() == ShaderType::COMPUTE) { + OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); + + OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); + OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); + + OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + + // TODO: 
Should probably note flat usage somewhere. SC emits a "FlatPtr32 = + // 0" comment but I don't see a corresponding field in the register spec. + } else { + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | + S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + if (STM.isVGPRSpillingEnabled(MFI)) { + OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + } + } + + if (MFI->getShaderType() == ShaderType::PIXEL) { + OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + } +} + +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + const AMDGPUSubtarget &STM = MF.getSubtarget(); + amd_kernel_code_t header; + + memset(&header, 0, sizeof(header)); + + header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; + header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; + + header.struct_byte_size = sizeof(amd_kernel_code_t); + + header.target_chip = STM.getAmdKernelCodeChipID(); + + header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); + + header.compute_pgm_resource_registers = + KernelInfo.ComputePGMRSrc1 | + (KernelInfo.ComputePGMRSrc2 << 32); + + // Code Properties: + header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | + AMD_CODE_PROPERTY_IS_PTR64; + + if (KernelInfo.FlatUsed) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + if (KernelInfo.ScratchBlocks) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; + + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + + // MFI->ABIArgOffset is the number of bytes for the kernel arguments + // plus 36. 36 is the number of bytes reserved at the begining of the + // input buffer to store work-group size information. + // FIXME: We should be adding the size of the implicit arguments + // to this value. + header.kernarg_segment_byte_size = MFI->ABIArgOffset; + + header.wavefront_sgpr_count = KernelInfo.NumSGPR; + header.workitem_vgpr_count = KernelInfo.NumVGPR; + + // FIXME: What values do I put for these alignments + header.kernarg_segment_alignment = 0; + header.group_segment_alignment = 0; + header.private_segment_alignment = 0; + + header.code_type = 1; // HSA_EXT_CODE_KERNEL + + header.wavefront_size = STM.getWavefrontSize(); + + MCSectionELF *VersionSection = + OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(VersionSection); + OutStreamer->EmitBytes(Twine("HSA Code Unit:" + + Twine(header.hsail_version_major) + "." + + Twine(header.hsail_version_minor) + ":" + + "AMD:" + + Twine(header.amd_code_version_major) + "." 
+ + Twine(header.amd_code_version_minor) + ":" + + "GFX8.1:0").str()); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + + if (isVerbose()) { + OutStreamer->emitRawComment("amd_code_version_major = " + + Twine(header.amd_code_version_major), false); + OutStreamer->emitRawComment("amd_code_version_minor = " + + Twine(header.amd_code_version_minor), false); + OutStreamer->emitRawComment("struct_byte_size = " + + Twine(header.struct_byte_size), false); + OutStreamer->emitRawComment("target_chip = " + + Twine(header.target_chip), false); + OutStreamer->emitRawComment(" compute_pgm_rsrc1: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc1), + false); + OutStreamer->emitRawComment(" compute_pgm_rsrc2: " + + Twine::utohexstr(KernelInfo.ComputePGMRSrc2), + false); + OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); + OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " + + Twine((bool)(header.code_properties & + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); + OutStreamer->emitRawComment("private_element_size = 2 ", false); + OutStreamer->emitRawComment("is_ptr64 = " + + Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); + OutStreamer->emitRawComment("workitem_private_segment_byte_size = " + + Twine(header.workitem_private_segment_byte_size), + false); + OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " + + Twine(header.workgroup_group_segment_byte_size), + false); + OutStreamer->emitRawComment("gds_segment_byte_size = " + + Twine(header.gds_segment_byte_size), false); + OutStreamer->emitRawComment("kernarg_segment_byte_size = " + + Twine(header.kernarg_segment_byte_size), false); + OutStreamer->emitRawComment("wavefront_sgpr_count = " + + Twine(header.wavefront_sgpr_count), false); + OutStreamer->emitRawComment("workitem_vgpr_count = " + + Twine(header.workitem_vgpr_count), false); + OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false); + OutStreamer->emitRawComment("wavefront_size = " + + Twine((int)header.wavefront_size), false); + OutStreamer->emitRawComment("optimization_level = " + + Twine(header.optimization_level), false); + OutStreamer->emitRawComment("hsail_profile = " + + Twine(header.hsail_profile), false); + OutStreamer->emitRawComment("hsail_machine_model = " + + Twine(header.hsail_machine_model), false); + OutStreamer->emitRawComment("hsail_version_major = " + + Twine(header.hsail_version_major), false); + OutStreamer->emitRawComment("hsail_version_minor = " + + Twine(header.hsail_version_minor), false); + } + + OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header))); +} + +bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. 
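+    // Only single-character modifiers are recognized: 'r' falls through to
+    // the register printer below, and anything else is delegated to the
+    // generic AsmPrinter::PrintAsmOperand() handling.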
+ + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'r': + break; + } + } + + AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, + *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h new file mode 100644 index 00000000000..1acff3a3222 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -0,0 +1,113 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" +#include + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { +private: + struct SIProgramInfo { + SIProgramInfo() : + VGPRBlocks(0), + SGPRBlocks(0), + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + ScratchSize(0), + ComputePGMRSrc1(0), + LDSBlocks(0), + ScratchBlocks(0), + ComputePGMRSrc2(0), + NumVGPR(0), + NumSGPR(0), + FlatUsed(false), + VCCUsed(false), + CodeLen(0) {} + + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t VGPRBlocks; + uint32_t SGPRBlocks; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + uint32_t ScratchSize; + + uint64_t ComputePGMRSrc1; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks; + uint32_t ScratchBlocks; + + uint64_t ComputePGMRSrc2; + + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t LDSSize; + bool FlatUsed; + + // Bonus information for debugging. + bool VCCUsed; + uint64_t CodeLen; + }; + + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + void findNumUsedRegistersSI(const MachineFunction &MF, + unsigned &NumSGPR, + unsigned &NumVGPR) const; + + /// \brief Emit register usage information so that the GPU driver + /// can correctly setup the GPU state. 
+ void EmitProgramInfoR600(const MachineFunction &MF); + void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitAmdKernelCodeT(const MachineFunction &MF, + const SIProgramInfo &KernelInfo) const; + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, + std::unique_ptr Streamer); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AMDGPU Assembly Printer"; + } + + /// Implemented in AMDGPUMCInstLower.cpp + void EmitInstruction(const MachineInstr *MI) override; + + void EmitEndOfAsmFile(Module &M) override; + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + +protected: + std::vector DisasmLines, HexLines; + size_t DisasmLineMaxLen; +}; + +} // End anonymous llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td new file mode 100644 index 00000000000..6ffa7a08358 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -0,0 +1,82 @@ +//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the AMD Radeon GPUs. +// +//===----------------------------------------------------------------------===// + +// Inversion of CCIfInReg +class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {} + +// Calling convention for SI +def CC_SI : CallingConv<[ + + CCIfInReg>>, + + CCIfInReg>>, + + CCIfNotInReg>>, + + CCIfByVal>> + +]>; + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg>> +]>; + +// Calling convention for compute kernels +def CC_AMDGPU_Kernel : CallingConv<[ + CCCustom<"allocateStack"> +]>; + +def CC_AMDGPU : CallingConv<[ + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() >=" + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "State.getMachineFunction().getInfo()" + "->getShaderType() == ShaderType::COMPUTE", + CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() < " + "AMDGPUSubtarget::SOUTHERN_ISLANDS", + CCDelegateTo> +]>; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp new file mode 100644 index 00000000000..8175786fb9b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -0,0 +1,112 @@ +//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +// Interface to describe a layout of a stack frame on a AMDIL target machine +// +//===----------------------------------------------------------------------===// +#include "AMDGPUFrameLowering.h" +#include "AMDGPURegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; +AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, + int LAO, unsigned TransAl) + : TargetFrameLowering(D, StackAl, LAO, TransAl) { } + +AMDGPUFrameLowering::~AMDGPUFrameLowering() { } + +unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { + + // XXX: Hardcoding to 1 for now. + // + // I think the StackWidth should stored as metadata associated with the + // MachineFunction. This metadata can either be added by a frontend, or + // calculated by a R600 specific LLVM IR pass. + // + // The StackWidth determines how stack objects are laid out in memory. + // For a vector stack variable, like: int4 stack[2], the data will be stored + // in the following ways depending on the StackWidth. + // + // StackWidth = 1: + // + // T0.X = stack[0].x + // T1.X = stack[0].y + // T2.X = stack[0].z + // T3.X = stack[0].w + // T4.X = stack[1].x + // T5.X = stack[1].y + // T6.X = stack[1].z + // T7.X = stack[1].w + // + // StackWidth = 2: + // + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T1.X = stack[0].z + // T1.Y = stack[0].w + // T2.X = stack[1].x + // T2.Y = stack[1].y + // T3.X = stack[1].z + // T3.Y = stack[1].w + // + // StackWidth = 4: + // T0.X = stack[0].x + // T0.Y = stack[0].y + // T0.Z = stack[0].z + // T0.W = stack[0].w + // T1.X = stack[1].x + // T1.Y = stack[1].y + // T1.Z = stack[1].z + // T1.W = stack[1].w + return 1; +} + +/// \returns The number of registers allocated for \p FI. +int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Start the offset at 2 so we don't overwrite work group information. + // XXX: We should only do this when the shader actually uses this + // information. + unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); + OffsetBytes += MFI->getObjectSize(i); + // Each register holds 4 bytes, so we must always align the offset to at + // least 4 bytes, so that 2 frame objects won't share the same register. 
+ OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); + } + + if (FI != -1) + OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); + + return OffsetBytes / (getStackWidth(MF) * 4); +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return nullptr; +} +void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const {} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h new file mode 100644 index 00000000000..9f31be1af79 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -0,0 +1,45 @@ +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface to describe a layout of a stack frame on a AMDIL target +/// machine. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +/// \brief Information about the stack frame layout on the AMDGPU targets. +/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. + unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + bool hasFP(const MachineFunction &MF) const override; +}; +} // namespace llvm +#endif diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp new file mode 100644 index 00000000000..df4461eac4d --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -0,0 +1,1371 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. 
+// +//===----------------------------------------------------------------------===// +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "SIDefines.h" +#include "SIISelLowering.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. + const AMDGPUSubtarget *Subtarget; +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + bool runOnMachineFunction(MachineFunction &MF) override; + SDNode *Select(SDNode *N) override; + const char *getPassName() const override; + void PostprocessISelDAG() override; + +private: + bool isInlineImmediate(SDNode *N) const; + bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, + const R600InstrInfo *TII); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); + bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector &); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static bool checkPrivateAddress(const MachineMemOperand *Op); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + bool isCPLoad(const LoadSDNode *N) const; + bool isConstantLoad(const LoadSDNode *N, int cbID) const; + bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; + bool isParamLoad(const LoadSDNode *N) const; + bool isPrivateLoad(const LoadSDNode *N) const; + bool isLocalLoad(const LoadSDNode *N) const; + bool isRegionLoad(const LoadSDNode *N) const; + + SDNode *glueCopyToM0(SDNode *N) const; + + const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue 
&VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const; + bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, SDValue &Offset, + SDValue &SLC) const; + bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, + SDValue &SOffset, SDValue &ImmOffset) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, SDValue &SLC, + SDValue &TFE) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset, SDValue &GLC) const; + SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; + + bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Omod) const; + bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const; + + SDNode *SelectADD_SUB_I64(SDNode *N); + SDNode *SelectDIV_SCALE(SDNode *N); + + SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width); + SDNode *SelectS_BFEFromShifts(SDNode *N); + SDNode *SelectS_BFE(SDNode *N); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) + : SelectionDAGISel(TM) {} + +bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &static_cast(MF.getSubtarget()); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { + const SITargetLowering *TL + = static_cast(getTargetLowering()); + return TL->analyzeImmediate(N) == 0; +} + +/// \brief Determine the register class for \p OpNo +/// \returns The register class of the virtual register that will be used for +/// the given operand number \OpNo or NULL if the register class cannot be +/// determined. 
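+///
+/// For REG_SEQUENCE nodes, for example, operand 0 holds the register class id
+/// and the remaining operands form (value, subregister index) pairs, so the
+/// class for \p OpNo is derived from the matching subregister index rather
+/// than from the instruction description.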
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, + unsigned OpNo) const { + if (!N->isMachineOpcode()) + return nullptr; + + switch (N->getMachineOpcode()) { + default: { + const MCInstrDesc &Desc = + Subtarget->getInstrInfo()->get(N->getMachineOpcode()); + unsigned OpIdx = Desc.getNumDefs() + OpNo; + if (OpIdx >= Desc.getNumOperands()) + return nullptr; + int RegClass = Desc.OpInfo[OpIdx].RegClass; + if (RegClass == -1) + return nullptr; + + return Subtarget->getRegisterInfo()->getRegClass(RegClass); + } + case AMDGPU::REG_SEQUENCE: { + unsigned RCID = cast(N->getOperand(0))->getZExtValue(); + const TargetRegisterClass *SuperRC = + Subtarget->getRegisterInfo()->getRegClass(RCID); + + SDValue SubRegOp = N->getOperand(OpNo + 1); + unsigned SubRegIdx = cast(SubRegOp)->getZExtValue(); + return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, + SubRegIdx); + } + } +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); + } + return true; +} + +SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + !checkType(cast(N)->getMemOperand()->getValue(), + AMDGPUAS::LOCAL_ADDRESS)) + return N; + + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + // Write max value to m0 before each load operation + + SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), + CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); + + SDValue Glue = M0.getValue(1); + + SmallVector Ops; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + Ops.push_back(N->getOperand(i)); + } + Ops.push_back(Glue); + CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); + + return N; +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. 
+ } + + if (isa(N)) + N = glueCopyToM0(N); + + switch (Opc) { + default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: + case ISD::SUB: { + if (N->getValueType(0) != MVT::i64 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectADD_SUB_I64(N); + } + case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: + case ISD::BUILD_VECTOR: { + unsigned RegClassID; + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + bool UseVReg = true; + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + if (!U->isMachineOpcode()) { + continue; + } + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (!RC) { + continue; + } + if (static_cast(TRI)->isSGPRClass(RC)) { + UseVReg = false; + } + } + switch(NumVectorElts) { + case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : + AMDGPU::SReg_32RegClassID; + break; + case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : + AMDGPU::SReg_64RegClassID; + break; + case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : + AMDGPU::SReg_128RegClassID; + break; + case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : + AMDGPU::SReg_256RegClassID; + break; + case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : + AMDGPU::SReg_512RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } else { + // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. + switch(NumVectorElts) { + case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + } + + SDLoc DL(N); + SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + + if (NumVectorElts == 1) { + return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, + N->getOperand(0), RegClass); + } + + assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + "supported yet"); + // 16 = Max Num Vector Elements + // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) + // 1 = Vector Register Class + SmallVector RegSeqArgs(NumVectorElts * 2 + 1); + + RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + bool IsRegSeq = true; + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { + // XXX: Why is this here? + if (isa(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[1 + (2 * i)] = N->getOperand(i); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + MVT::i32); + } + + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. 
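+ // E.g. a SCALAR_TO_VECTOR producing v4i32 only supplies operand 0; the
+ // remaining sub-registers of the REG_SEQUENCE are filled with one shared
+ // IMPLICIT_DEF value below.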
+ assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + DL, EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + } + } + + if (!IsRegSeq) + break; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), + RegSeqArgs); + } + case ISD::BUILD_PAIR: { + SDValue RC, SubReg0, SubReg1; + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + break; + } + SDLoc DL(N); + if (N->getValueType(0) == MVT::i128) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); + } else if (N->getValueType(0) == MVT::i64) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + } else { + llvm_unreachable("Unhandled value type for BUILD_PAIR"); + } + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, + N->getOperand(1), SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + case ISD::Constant: + case ISD::ConstantFP: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + break; + + uint64_t Imm; + if (ConstantFPSDNode *FP = dyn_cast(N)) + Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); + else { + ConstantSDNode *C = cast(N); + Imm = C->getZExtValue(); + } + + SDLoc DL(N); + SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, + MVT::i32)); + SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops); + } + + case ISD::LOAD: { + LoadSDNode *LD = cast(N); + SDLoc SL(N); + EVT VT = N->getValueType(0); + + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { + N = glueCopyToM0(N); + break; + } + + // To simplify the TableGen patters, we replace all i64 loads with + // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 + // during DAG legalization, however, so places (ExpandUnalignedLoad) + // in the DAG legalizer assume that if i64 is legal, so doing this + // promotion early can cause problems. + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); + SDNode *Load = glueCopyToM0(NewLoad.getNode()); + SelectCode(Load); + N = BitCast.getNode(); + break; + } + + case ISD::STORE: { + // Handle i64 stores here for the same reason mentioned above for loads. 
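+ // That is, a non-truncating store of an i64 value is rewritten as a store
+ // of the value bitcast to v2i32, and the new store (plus the bitcast, when
+ // it is still present) is selected instead of the original node.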
+ StoreSDNode *ST = cast(N); + SDValue Value = ST->getValue(); + if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { + + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), + MVT::v2i32, Value); + SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, + ST->getBasePtr(), ST->getMemOperand()); + + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); + + if (NewValue.getOpcode() == ISD::BITCAST) { + Select(NewStore.getNode()); + return SelectCode(NewValue.getNode()); + } + + // getNode() may fold the bitcast if its input was another bitcast. If that + // happens we should only select the new store. + N = NewStore.getNode(); + } + + N = glueCopyToM0(N); + break; + } + + case AMDGPUISD::REGISTER_LOAD: { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + break; + SDValue Addr, Offset; + + SDLoc DL(N); + SelectADDRIndirect(N->getOperand(1), Addr, Offset); + const SDValue Ops[] = { + Addr, + Offset, + CurDAG->getTargetConstant(0, DL, MVT::i32), + N->getOperand(0), + }; + return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, + CurDAG->getVTList(MVT::i32, MVT::i64, + MVT::Other), + Ops); + } + case AMDGPUISD::REGISTER_STORE: { + if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + break; + SDValue Addr, Offset; + SelectADDRIndirect(N->getOperand(2), Addr, Offset); + SDLoc DL(N); + const SDValue Ops[] = { + N->getOperand(1), + Addr, + Offset, + CurDAG->getTargetConstant(0, DL, MVT::i32), + N->getOperand(0), + }; + return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, + CurDAG->getVTList(MVT::Other), + Ops); + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + // There is a scalar version available, but unlike the vector version which + // has a separate operand for the offset and width, the scalar version packs + // the width and offset into a single operand. Try to move to the scalar + // version if the offsets are constant, so that we can try to keep extended + // loads of kernel arguments in SGPRs. + + // TODO: Technically we could try to pattern match scalar bitshifts of + // dynamic values, but it's probably not useful. + ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); + if (!Offset) + break; + + ConstantSDNode *Width = dyn_cast(N->getOperand(2)); + if (!Width) + break; + + bool Signed = Opc == AMDGPUISD::BFE_I32; + + uint32_t OffsetVal = Offset->getZExtValue(); + uint32_t WidthVal = Width->getZExtValue(); + + return getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), + N->getOperand(0), OffsetVal, WidthVal); + + } + case AMDGPUISD::DIV_SCALE: { + return SelectDIV_SCALE(N); + } + case ISD::CopyToReg: { + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + Lowering.legalizeTargetIndependentNode(N, *CurDAG); + break; + } + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); + case ISD::AND: + case ISD::SRL: + case ISD::SRA: + if (N->getValueType(0) != MVT::i32 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + + return SelectS_BFE(N); + } + + return SelectCode(N); +} + + +bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { + assert(AS != 0 && "Use checkPrivateAddress instead."); + if (!Ptr) + return false; + + return Ptr->getType()->getPointerAddressSpace() == AS; +} + +bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { + if (Op->getPseudoValue()) + return true; + + if (PointerType *PT = dyn_cast(Op->getValue()->getType())) + return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + return false; +} + +bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + const Value *MemVal = N->getMemOperand()->getValue(); + return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); +} + +bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { + const Value *MemVal = N->getMemOperand()->getValue(); + if (CbId == -1) + return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + + return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); +} + +bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { + if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || + N->getMemoryVT().bitsLT(MVT::i32)) + return true; + + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { + return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); +} + +bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { + MachineMemOperand *MMO = N->getMemOperand(); + if (checkPrivateAddress(N->getMemOperand())) { + if (MMO) { + const PseudoSourceValue *PSV = MMO->getPseudoValue(); + if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + return true; + } + } + } + return false; +} + +bool 
AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { + if (checkPrivateAddress(N->getMemOperand())) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + + const Value *MemVal = N->getMemOperand()->getValue(); + if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && + !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && + !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && + !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { + return true; + } + return false; +} + +const char *AMDGPUDAGToDAGISel::getPassName() const { + return "AMDGPU DAG->DAG Pattern Instruction Selection"; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP + +//===----------------------------------------------------------------------===// +// Complex Patterns +//===----------------------------------------------------------------------===// + +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. 
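+ // For example, a read from absolute address 16 selects Base = ZERO and
+ // Offset = 16 rather than materializing the constant pointer in a register.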
+ } else if ((IMMOffset = dyn_cast(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + SDLoc(CurDAG->getEntryNode()), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + SDLoc DL(Addr); + + if ((C = dyn_cast(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } + + return true; +} + +SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { + SDLoc DL(N); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + bool IsAdd = (N->getOpcode() == ISD::ADD); + + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); + SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + + SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); + SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + + SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); + SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; + + + unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + + SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); + SDValue Carry(AddLo, 1); + SDNode *AddHi + = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, + SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); + + SDValue Args[5] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, + }; + return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); +} + +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + SDLoc SL(N); + EVT VT = N->getValueType(0); + + assert(VT == MVT::f32 || VT == MVT::f64); + + unsigned Opc + = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + SDValue Ops[8]; + + SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); + return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); +} + +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, + unsigned OffsetBits) const { + if ((OffsetBits == 16 && !isUInt<16>(Offset)) || + (OffsetBits == 8 && !isUInt<8>(Offset))) + return false; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + +bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + // (add n0, c0) + Base = N0; + Offset = N1; + return true; + } + } + + SDLoc DL(Addr); + + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. + if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { + if (isUInt<16>(CAddr->getZExtValue())) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset = Addr; + return true; + } + } + + // default case + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + SDLoc DL(Addr); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { + unsigned DWordOffset0 = CAddr->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + assert(4 * DWordOffset0 == CAddr->getZExtValue()); + + if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + MachineSDNode *MovZero + = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + DL, MVT::i32, Zero); + Base = SDValue(MovZero, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); + return true; +} + +static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { + return isUInt<12>(Imm->getZExtValue()); +} + +void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, + SDValue &VAddr, SDValue 
&SOffset, + SDValue &Offset, SDValue &Offen, + SDValue &Idxen, SDValue &Addr64, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDLoc DL(Addr); + + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); + + Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); + Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + + if (N0.getOpcode() == ISD::ADD) { + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N2; + VAddr = N3; + } else { + + // (add N0, C1) -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = N0; + } + + if (isLegalMUBUFImmOffset(C1)) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return; + } else if (isUInt<32>(C1->getZExtValue())) { + // Illegal offset, store it in soffset. + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), + 0); + return; + } + } + + if (Addr.getOpcode() == ISD::ADD) { + // (add N0, N1) -> addr64 + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + Ptr = N0; + VAddr = N1; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return; + } + + // default case -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); + Ptr = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, SDValue &GLC, + SDValue &SLC, SDValue &TFE) const { + SDValue Ptr, Offen, Idxen, Addr64; + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + ConstantSDNode *C = cast(Addr64); + if (C->getSExtValue()) { + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &Offset, + SDValue &SLC) const { + SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); + SDValue GLC, TFE; + + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, + SDValue &VAddr, SDValue &SOffset, + SDValue &ImmOffset) const { + + SDLoc DL(Addr); + MachineFunction &MF = CurDAG->getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + unsigned ScratchOffsetReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, + ScratchOffsetReg, MVT::i32); + SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); + SDValue ScratchRsrcDword0 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, 
MVT::i32, Sym0), 0); + + SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); + SDValue ScratchRsrcDword1 = + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); + + const SDValue RsrcOps[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + ScratchRsrcDword0, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + ScratchRsrcDword1, + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + }; + SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, RsrcOps), 0); + Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); + SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, + MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + + // (add n0, c1) + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + + if (isLegalMUBUFImmOffset(C1)) { + VAddr = Addr.getOperand(0); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + } + + // (node) + VAddr = Addr; + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &SOffset, SDValue &Offset, + SDValue &GLC, SDValue &SLC, + SDValue &TFE) const { + SDValue Ptr, VAddr, Offen, Idxen, Addr64; + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE); + + if (!cast(Offen)->getSExtValue() && + !cast(Idxen)->getSExtValue() && + !cast(Addr64)->getSExtValue()) { + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | + APInt::getAllOnesValue(32).getZExtValue(); // Size + SDLoc DL(Addr); + + const SITargetLowering& Lowering = + *static_cast(getTargetLowering()); + + SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); + return true; + } + return false; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset, + SDValue &GLC) const { + SDValue SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} + +// FIXME: This is incorrect and only enough to be able to compile. +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast(N); + SDLoc DL(N); + + assert(Subtarget->hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. 
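+ // Concretely: a 64-bit to 32-bit cast keeps sub0 of the source register,
+ // a 32-bit to 64-bit cast builds a REG_SEQUENCE with the source in sub0 and
+ // zero in sub1, and a same-size cast becomes a plain bitcast.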
+ + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + + // FIXME: This is probably wrong, we should never be defining + // a register class with both VGPRs and SGPRs + SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, + MVT::i32); + + const SDValue Ops[] = { + RC, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(0, DL, MVT::i32)), 0), + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + DL, N->getValueType(0), Ops); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + uint32_t Offset, uint32_t Width) { + // Transformation function, pack the offset and width of a BFE into + // the format expected by the S_BFE_I32 / S_BFE_U32. In the second + // source, bits [5:0] contain the offset and bits [22:16] the width. + uint32_t PackedVal = Offset | (Width << 16); + SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { + // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) + // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) + // Predicate: 0 < b <= c < 32 + + const SDValue &Shl = N->getOperand(0); + ConstantSDNode *B = dyn_cast(Shl->getOperand(1)); + ConstantSDNode *C = dyn_cast(N->getOperand(1)); + + if (B && C) { + uint32_t BVal = B->getZExtValue(); + uint32_t CVal = C->getZExtValue(); + + if (0 < BVal && BVal <= CVal && CVal < 32) { + bool Signed = N->getOpcode() == ISD::SRA; + unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; + + return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), + CVal - BVal, 32 - CVal); + } + } + return SelectCode(N); +} + +SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { + switch (N->getOpcode()) { + case ISD::AND: + if (N->getOperand(0).getOpcode() == ISD::SRL) { + // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" + // Predicate: isMask(mask) + const SDValue &Srl = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast(Srl.getOperand(1)); + ConstantSDNode *Mask = dyn_cast(N->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue(); + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), + ShiftVal, WidthVal); + } + } + } + break; + case ISD::SRL: + if (N->getOperand(0).getOpcode() == ISD::AND) { + // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" + // Predicate: isMask(mask >> b) + const SDValue &And = N->getOperand(0); + ConstantSDNode *Shift = dyn_cast(N->getOperand(1)); + ConstantSDNode *Mask = dyn_cast(And->getOperand(1)); + + if (Shift && Mask) { + uint32_t ShiftVal = Shift->getZExtValue(); + uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; + + if (isMask_32(MaskVal)) { + uint32_t WidthVal = countPopulation(MaskVal); + + return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), + ShiftVal, WidthVal); + } + } + } else if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + case ISD::SRA: + if (N->getOperand(0).getOpcode() == ISD::SHL) + return SelectS_BFEFromShifts(N); + break; + } + + return SelectCode(N); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + + unsigned Mods = 0; + + Src = In; + + if (Src.getOpcode() == ISD::FNEG) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(0); + } + + if (Src.getOpcode() == ISD::FABS) { + Mods |= SISrcMods::ABS; + Src = Src.getOperand(0); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + SDLoc DL(In); + // FIXME: Handle Clamp and Omod + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Omod) const { + // FIXME: Handle Omod + Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp, + SDValue &Omod) const { + Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + return SelectVOP3Mods(In, Src, SrcMods); +} + +void AMDGPUDAGToDAGISel::PostprocessISelDAG() { + const AMDGPUTargetLowering& Lowering = + *static_cast(getTargetLowering()); + bool IsModified = false; + do { + IsModified = false; + // Go over all selected nodes and try to fold them a bit more + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ++I) { + + SDNode *Node = I; + + MachineSDNode *MachineNode = dyn_cast(I); + if (!MachineNode) + continue; + + SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); + if (ResNode != Node) { + ReplaceUses(Node, 
ResNode); + IsModified = true; + } + } + CurDAG->RemoveDeadNodes(); + } while (IsModified); +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp new file mode 100644 index 00000000000..d56838ec201 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -0,0 +1,2866 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600MachineFunctionInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +using namespace llvm; + +namespace { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} + + +static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + + return true; +} + +#include "AMDGPUGenCallingConv.inc" + +// Find a larger type to do a load / store of a vector with. +EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, StoreSize); + + assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +// Type for a vector that will be loaded to. 
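+// For example, a v2i8 value (16-bit store size) is loaded through an i32
+// register, while a v2i64 value (128 bits) uses a v4i32 register.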
+EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { + unsigned StoreSize = VT.getStoreSizeInBits(); + if (StoreSize <= 32) + return EVT::getIntegerVT(Ctx, 32); + + return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); +} + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : TargetLowering(TM), Subtarget(&STI) { + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v8f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::STORE, MVT::v16f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + + setOperationAction(ISD::STORE, MVT::v2f64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + + // Custom lowering of vector stores is required for local address space + // stores. + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + + // XXX: This can be change to Custom, once ExpandVectorStores can + // handle 64-bit stores. 
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); + + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); + + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } + + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + + if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + } + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. 
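+ // (Roughly, copysign(x, y) == BFI(0x7fffffff, x, y): keep the exponent and
+ // mantissa bits of x and take the sign bit from y.)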
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); + + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + + // GPU does not have divrem function for signed or unsigned. + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction. + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // The hardware supports 32-bit ROTR, but not ROTL. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + setOperationAction(ISD::ROTR, MVT::i64, Expand); + + setOperationAction(ISD::MUL, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + + setOperationAction(ISD::SMIN, MVT::i32, Legal); + setOperationAction(ISD::UMIN, MVT::i32, Legal); + setOperationAction(ISD::SMAX, MVT::i32, Legal); + setOperationAction(ISD::UMAX, MVT::i32, Legal); + + if (!Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + + if (!Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + + static const MVT::SimpleValueType VectorIntTypes[] = { + MVT::v2i32, MVT::v4i32 + }; + + for (MVT VT : VectorIntTypes) { + // Expand the following operations for the current type by default. 
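+ // Expand here generally means the legalizer unrolls the vector operation
+ // into the corresponding scalar i32 operations.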
+ setOperationAction(ISD::ADD, VT, Expand); + setOperationAction(ISD::AND, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + setOperationAction(ISD::MUL, VT, Expand); + setOperationAction(ISD::OR, VT, Expand); + setOperationAction(ISD::SHL, VT, Expand); + setOperationAction(ISD::SRA, VT, Expand); + setOperationAction(ISD::SRL, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::SUB, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::XOR, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + static const MVT::SimpleValueType FloatVectorTypes[] = { + MVT::v2f32, MVT::v4f32 + }; + + for (MVT VT : FloatVectorTypes) { + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FMINNUM, VT, Expand); + setOperationAction(ISD::FMAXNUM, VT, Expand); + setOperationAction(ISD::FADD, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FMUL, VT, Expand); + setOperationAction(ISD::FMA, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FSUB, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + } + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + + setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + 
setSchedulingPreference(Sched::RegPressure); + setJumpIsExpensive(true); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(false); + + setSelectIsExpensive(false); + PredictableSelectIsExpensive = false; + + // There are no integer divide instructions, and these expand to a pretty + // large sequence of instructions. + setIntDivIsCheap(false); + setPow2SDivIsCheap(false); + setFsqrtIsCheap(true); + + // FIXME: Need to really handle these. + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; +} + +//===----------------------------------------------------------------------===// +// Target Information +//===----------------------------------------------------------------------===// + +MVT AMDGPUTargetLowering::getVectorIdxTy() const { + return MVT::i32; +} + +bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { + return true; +} + +// The backend supports 32 and 64 bit floating point immediates. +// FIXME: Why are we reporting vectors of FP immediates as legal? +bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); +} + +// We don't want to shrink f64 / f32 constants. +bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { + EVT ScalarVT = VT.getScalarType(); + return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); +} + +bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, + ISD::LoadExtType, + EVT NewVT) const { + + unsigned NewSize = NewVT.getStoreSizeInBits(); + + // If we are reducing to a 32-bit load, this is always better. + if (NewSize == 32) + return true; + + EVT OldVT = N->getValueType(0); + unsigned OldSize = OldVT.getStoreSizeInBits(); + + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar + // extloads, so doing one requires using a buffer_load. In cases where we + // still couldn't use a scalar load, using the wider load shouldn't really + // hurt anything. + + // If the old size already had to be an extload, there's no harm in continuing + // to reduce the width. + return (OldSize < 32); +} + +bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, + EVT CastTy) const { + if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) + return true; + + unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + + return ((LScalarSize <= CastScalarSize) || + (CastScalarSize >= 32) || + (LScalarSize < 32)); +} + +// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also +// profitable with the expansion for 64-bit since it's generally good to +// speculate things. +// FIXME: These should really have the size as a parameter. 
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { + return true; +} + +bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { + return true; +} + +//===---------------------------------------------------------------------===// +// Target Properties +//===---------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64; +} + +bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const { + return true; +} + +bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { + // Truncate is just accessing a subregister. + return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { + // Truncate is just accessing a subregister. + return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && + (Dest->getPrimitiveSizeInBits() % 32 == 0); +} + +bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { + const DataLayout *DL = getDataLayout(); + unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); + unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); + + return SrcSize == 32 && DestSize == 64; +} + +bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { + // Any register load of a 64-bit value really requires 2 32-bit moves. For all + // practical purposes, the extra mov 0 to load a 64-bit is free. As used, + // this will enable reducing 64-bit operations the 32-bit, which is always + // good. + return Src == MVT::i32 && Dest == MVT::i64; +} + +bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + return isZExtFree(Val.getValueType(), VT2); +} + +bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { + // There aren't really 64-bit registers, but pairs of 32-bit ones and only a + // limited number of native 64-bit operations. Shrinking an operation to fit + // in a single 32-bit register should always be helpful. As currently used, + // this is much less general than the name suggests, and is only used in + // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is + // not profitable, and may actually be harmful. 
+ return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl &Ins) const { + + State.AnalyzeFormalArguments(Ins, CC_AMDGPU); +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SDValue Callee = CLI.Callee; + SelectionDAG &DAG = CLI.DAG; + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + StringRef FuncName(""); + + if (const ExternalSymbolSDNode *G = dyn_cast(Callee)) + FuncName = G->getSymbol(); + else if (const GlobalAddressSDNode *G = dyn_cast(Callee)) + FuncName = G->getGlobal()->getName(); + + DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DAG.getContext()->diagnose(NoCalls); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + llvm_unreachable("Custom lowering code for this" + "instruction is not implemented yet!"); + break; + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); + case ISD::FREM: return LowerFREM(Op, DAG); + case ISD::FCEIL: return LowerFCEIL(Op, DAG); + case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); + case ISD::FRINT: return LowerFRINT(Op, DAG); + case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); + case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + } + return Op; +} + +void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::SIGN_EXTEND_INREG: + // Different parts of legalization seem to interpret which type of + // sign_extend_inreg is the one to check for custom lowering. The extended + // from type is what really matters, but some places check for custom + // lowering of the result type. This results in trying to use + // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do + // nothing here and let the illegal result integer be handled normally. 
+ return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + if (!Node) + return; + + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: { + SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); + if (Lowered.getNode()) + Results.push_back(Lowered); + return; + } + default: + return; + } +} + +// FIXME: This implements accesses to initialized globals in the constant +// address space by copying them to private and accessing that. It does not +// properly handle illegal types or vectors. The private vector loads are not +// scalarized, and the illegal scalars hit an assertion. This technique will not +// work well with large initializers, and this should eventually be +// removed. Initialized globals should be placed into a data section that the +// runtime will load into a buffer before the kernel is executed. Uses of the +// global need to be replaced with a pointer loaded from an implicit kernel +// argument into this buffer holding the copy of the data, which will remove the +// need for any of this. +SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, + const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const { + const DataLayout *TD = getDataLayout(); + SDLoc DL(InitPtr); + Type *InitTy = Init->getType(); + + if (const ConstantInt *CI = dyn_cast(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + + if (const ConstantFP *CFP = dyn_cast(Init)) { + EVT VT = EVT::getEVT(CFP->getType()); + PointerType *PtrTy = PointerType::get(CFP->getType(), 0); + return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(CFP->getType())); + } + + if (StructType *ST = dyn_cast(InitTy)) { + const StructLayout *SL = TD->getStructLayout(ST); + + EVT PtrVT = InitPtr.getValueType(); + SmallVector Chains; + + for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { + SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(I); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if (SequentialType *SeqTy = dyn_cast(InitTy)) { + EVT PtrVT = InitPtr.getValueType(); + + unsigned NumElements; + if (ArrayType *AT = dyn_cast(SeqTy)) + NumElements = AT->getNumElements(); + else if (VectorType *VT = dyn_cast(SeqTy)) + NumElements = VT->getNumElements(); + else + llvm_unreachable("Unexpected type"); + + unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); + SmallVector Chains; + for (unsigned i = 0; i < NumElements; ++i) { + SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); + + Constant *Elt = Init->getAggregateElement(i); + Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); + } + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + if 
(isa(Init)) { + EVT VT = EVT::getEVT(InitTy); + PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); + return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, + MachinePointerInfo(UndefValue::get(PtrTy)), false, false, + TD->getPrefTypeAlignment(InitTy)); + } + + Init->dump(); + llvm_unreachable("Unhandled constant initializer"); +} + +static bool hasDefinedInitializer(const GlobalValue *GV) { + const GlobalVariable *GVar = dyn_cast(GV); + if (!GVar || !GVar->hasInitializer()) + return false; + + if (isa(GVar->getInitializer())) + return false; + + return true; +} + +SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, + SDValue Op, + SelectionDAG &DAG) const { + + const DataLayout *TD = getDataLayout(); + GlobalAddressSDNode *G = cast(Op); + const GlobalValue *GV = G->getGlobal(); + + switch (G->getAddressSpace()) { + case AMDGPUAS::LOCAL_ADDRESS: { + // XXX: What does the value of G->getOffset() mean? + assert(G->getOffset() == 0 && + "Do not know what to do with an non-zero offset"); + + // TODO: We could emit code to handle the initialization somewhere. + if (hasDefinedInitializer(GV)) + break; + + unsigned Offset; + if (MFI->LocalMemoryObjects.count(GV) == 0) { + uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); + Offset = MFI->LDSSize; + MFI->LocalMemoryObjects[GV] = Offset; + // XXX: Account for alignment? + MFI->LDSSize += Size; + } else { + Offset = MFI->LocalMemoryObjects[GV]; + } + + return DAG.getConstant(Offset, SDLoc(Op), + getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + Type *EltType = GV->getType()->getElementType(); + unsigned Size = TD->getTypeAllocSize(EltType); + unsigned Alignment = TD->getPrefTypeAlignment(EltType); + + MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); + MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + + int FI = FrameInfo->CreateStackObject(Size, Alignment, false); + SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); + + const GlobalVariable *Var = cast(GV); + if (!Var->hasInitializer()) { + // This has no use, but bugpoint will hit it. 
+ return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + + const Constant *Init = Var->getInitializer(); + SmallVector WorkList; + + for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), + E = DAG.getEntryNode()->use_end(); I != E; ++I) { + if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) + continue; + WorkList.push_back(*I); + } + SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); + for (SmallVector::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + SmallVector Ops; + Ops.push_back(Chain); + for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { + Ops.push_back((*I)->getOperand(i)); + } + DAG.UpdateNodeOperands(*I, Ops); + } + return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); + } + } + + const Function &Fn = *DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadInit(Fn, + "initializer for address space"); + DAG.getContext()->diagnose(BadInit); + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + SmallVector Args; + + for (const SDUse &U : Op->ops()) + DAG.ExtractVectorElements(U.get(), Args); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + + SmallVector Args; + unsigned Start = cast(Op.getOperand(1))->getZExtValue(); + EVT VT = Op.getValueType(); + DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, + VT.getVectorNumElements()); + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); +} + +SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); + + FrameIndexSDNode *FIN = cast(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + +SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDGPU_abs: + case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + + case AMDGPUIntrinsic::AMDGPU_clamp: + case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_legacy_rsq: + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq_clamped: + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } else { + return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); + } + + case Intrinsic::AMDGPU_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umul24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_imul24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_umad24: + return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_imad24: + return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfi: + return DAG.getNode(AMDGPUISD::BFI, DL, VT, + Op.getOperand(1), + 
Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_bfm: + return DAG.getNode(AMDGPUISD::BFM, DL, VT, + Op.getOperand(1), + Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDGPU_brev: + return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. + return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + } +} + +///IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + Op.getOperand(1)); + + return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, DL, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + break; + case ISD::SETULE: + case ISD::SETULT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + // Ordered. Assume ordered for undefined. + + // Only do this after legalization to avoid interfering with other combines + // which might occur. + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. 
+ if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETOGE: + case ISD::SETOGT: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && + !DCI.isCalledByLegalizer()) + return SDValue(); + + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); + return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); + } + case ISD::SETCC_INVALID: + llvm_unreachable("Invalid setcc condcode!"); + } + return SDValue(); +} + +// FIXME: Remove this when combines added to DAGCombiner. +SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const { + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + ISD::CondCode CCOpcode = cast(CC)->get(); + switch (CCOpcode) { + case ISD::SETULE: + case ISD::SETULT: { + unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETLE: + case ISD::SETLT: { + unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: { + unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + case ISD::SETUGE: + case ISD::SETUGT: { + unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; + return DAG.getNode(Opc, DL, VT, LHS, RHS); + } + default: + return SDValue(); + } +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + LoadSDNode *Load = cast(Op); + EVT MemVT = Load->getMemoryVT(); + EVT MemEltVT = MemVT.getVectorElementType(); + + EVT LoadVT = Op.getValueType(); + EVT EltVT = LoadVT.getVectorElementType(); + EVT PtrVT = Load->getBasePtr().getValueType(); + + unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); + SmallVector Loads; + SmallVector Chains; + + SDLoc SL(Op); + unsigned MemEltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), + DAG.getConstant(i * MemEltSize, SL, PtrVT)); + + SDValue NewLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, + Load->getChain(), Ptr, + SrcValue.getWithOffset(i * MemEltSize), + MemEltVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + Loads.push_back(NewLoad.getValue(0)); + Chains.push_back(NewLoad.getValue(1)); + } + + SDValue Ops[] = { + DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. 
+ if (VT.getVectorNumElements() == 2) + return ScalarizeVectorLoad(Op, DAG); + + LoadSDNode *Load = cast(Op); + SDValue BasePtr = Load->getBasePtr(); + EVT PtrVT = BasePtr.getValueType(); + EVT MemVT = Load->getMemoryVT(); + SDLoc SL(Op); + MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + SDValue LoLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, + SrcValue, + LoMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + SDValue HiLoad + = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, + Load->getChain(), HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, Load->isVolatile(), Load->isNonTemporal(), + Load->isInvariant(), Load->getAlignment()); + + SDValue Ops[] = { + DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), + DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + LoLoad.getValue(1), HiLoad.getValue(1)) + }; + + return DAG.getMergeValues(Ops, SL); +} + +SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast(Op); + EVT MemVT = Store->getMemoryVT(); + unsigned MemBits = MemVT.getSizeInBits(); + + // Byte stores are really expensive, so if possible, try to pack 32-bit vector + // truncating store into an i32 store. + // XXX: We could also handle optimize other vector bitwidths. + if (!MemVT.isVector() || MemBits > 32) { + return SDValue(); + } + + SDLoc DL(Op); + SDValue Value = Store->getValue(); + EVT VT = Value.getValueType(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Ptr = Store->getBasePtr(); + EVT MemEltVT = MemVT.getVectorElementType(); + unsigned MemEltBits = MemEltVT.getSizeInBits(); + unsigned MemNumElements = MemVT.getVectorNumElements(); + unsigned PackedSize = MemVT.getStoreSizeInBits(); + SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); + + assert(Value.getValueType().getScalarSizeInBits() >= 32); + + SDValue PackedValue; + for (unsigned i = 0; i < MemNumElements; ++i) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, + DAG.getConstant(i, DL, MVT::i32)); + Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); + Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg + + SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); + Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); + + if (i == 0) { + PackedValue = Elt; + } else { + PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); + } + } + + if (PackedSize < 32) { + EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); + return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + PackedVT, + Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + } + + return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, + Store->getMemOperand()->getPointerInfo(), + Store->isVolatile(), Store->isNonTemporal(), + Store->getAlignment()); +} + +SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast(Op); + EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); + EVT EltVT 
= Store->getValue().getValueType().getVectorElementType(); + EVT PtrVT = Store->getBasePtr().getValueType(); + unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); + SDLoc SL(Op); + + SmallVector<SDValue, 8> Chains; + + unsigned EltSize = MemEltVT.getStoreSize(); + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + + for (unsigned i = 0, e = NumElts; i != e; ++i) { + SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Store->getValue(), + DAG.getConstant(i, SL, MVT::i32)); + + SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); + SDValue NewStore = + DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, + SrcValue.getWithOffset(i * EltSize), + MemEltVT, Store->isNonTemporal(), Store->isVolatile(), + Store->getAlignment()); + Chains.push_back(NewStore); + } + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); +} + +SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, + SelectionDAG &DAG) const { + StoreSDNode *Store = cast<StoreSDNode>(Op); + SDValue Val = Store->getValue(); + EVT VT = Val.getValueType(); + + // If this is a 2 element vector, we really want to scalarize and not create + // weird 1 element vectors. + if (VT.getVectorNumElements() == 2) + return ScalarizeVectorStore(Op, DAG); + + EVT MemVT = Store->getMemoryVT(); + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + SDLoc SL(Op); + + EVT LoVT, HiVT; + EVT LoMemVT, HiMemVT; + SDValue Lo, Hi; + + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); + std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); + + EVT PtrVT = BasePtr.getValueType(); + SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(LoMemVT.getStoreSize(), SL, + PtrVT)); + + MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + SDValue LoStore + = DAG.getTruncStore(Chain, SL, Lo, + BasePtr, + SrcValue, + LoMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + SDValue HiStore + = DAG.getTruncStore(Chain, SL, Hi, + HiPtr, + SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, + Store->isNonTemporal(), + Store->isVolatile(), + Store->getAlignment()); + + return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); +} + + +SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT VT = Op.getValueType(); + EVT MemVT = Load->getMemoryVT(); + + if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { + assert(VT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. + + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || + Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || + ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) + return SDValue(); + + // Pre-SI private-address extloads narrower than 32 bits are emulated: load the + // containing dword, then shift and mask out the requested value. + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register.
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. + Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); + if (Result.getNode()) { + return Result; + } + + StoreSDNode *Store = cast(Op); + SDValue Chain = Store->getChain(); + if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && + Store->getValue().getValueType().isVector()) { + return ScalarizeVectorStore(Op, DAG); + } + + EVT MemVT = Store->getMemoryVT(); + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && + MemVT.bitsLT(MVT::i32)) { + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + SDValue BasePtr = Store->getBasePtr(); + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + } + return SDValue(); +} + +// This is a shortcut for integer division because we have fast i32<->f32 +// conversions, and fast f32 reciprocal instructions. The fractional part of a +// float is enough to accurately represent up to a 24-bit integer. 
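The idea described in that comment, divide in f32 (which has a 24-bit significand), truncate, and correct the quotient by at most one, can be sketched as ordinary scalar C++ before the DAG code below. The helper name udivrem24 is made up for illustration, and the hardware RCP instruction is approximated by a plain division, so this is only a model of the algorithm, not the exact instruction sequence:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Unsigned 24-bit division via f32, mirroring the unsigned path of
    // LowerDIVREM24 (where the correction term jq is 1).
    static void udivrem24(uint32_t a, uint32_t b, uint32_t &q, uint32_t &r) {
      assert(a < (1u << 24) && b != 0 && b < (1u << 24));

      float fa = static_cast<float>(a);
      float fb = static_cast<float>(b);

      float fq = std::trunc(fa * (1.0f / fb)); // fq = trunc(native_divide(fa, fb))
      float fr = std::fabs(-fq * fb + fa);     // fr = |mad(-fq, fb, fa)|

      q = static_cast<uint32_t>(fq);
      if (fr >= fb)                            // quotient estimate was one too small
        q += 1;
      r = a - q * b;                           // recompute the remainder exactly
    }

    int main() {
      uint32_t q, r;
      udivrem24(1000000, 37, q, r);
      std::printf("%u %u\n", q, r);            // 27027 1
      return 0;
    }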
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT IntVT = MVT::i32; + MVT FltVT = MVT::f32; + + ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; + + if (VT.isVector()) { + unsigned NElts = VT.getVectorNumElements(); + IntVT = MVT::getVectorVT(MVT::i32, NElts); + FltVT = MVT::getVectorVT(MVT::f32, NElts); + } + + unsigned BitSize = VT.getScalarType().getSizeInBits(); + + SDValue jq = DAG.getConstant(1, DL, IntVT); + + if (sign) { + // char|short jq = ia ^ ib; + jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, VT, jq, + DAG.getConstant(BitSize - 2, DL, VT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, IntVT); + } + + // int ia = (int)LHS; + SDValue ia = sign ? + DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); + + // int ib, (int)RHS; + SDValue ib = sign ? + DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, + fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, + DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); + + // int cv = fr >= fb; + SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); + + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); + + // dst = trunc/extend to legal type + iq = sign ? 
DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); + + // dst = iq + jq; + SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); + + // Rem needs compensation, it's easier to recompute it + SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); + Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, + SelectionDAG &DAG, + SmallVectorImpl &Results) const { + assert(Op.getValueType() == MVT::i64); + + SDLoc DL(Op); + EVT VT = Op.getValueType(); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + SDValue one = DAG.getConstant(1, DL, HalfVT); + SDValue zero = DAG.getConstant(0, DL, HalfVT); + + //HiLo split + SDValue LHS = Op.getOperand(0); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + + SDValue RHS = Op.getOperand(1); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + + if (VT == MVT::i64 && + DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + + SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); + Results.push_back(DIV); + Results.push_back(REM); + return; + } + + // Get Speculative values + SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); + SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); + + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); + + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); + SDValue DIV_Lo = zero; + + const unsigned halfBitWidth = HalfVT.getSizeInBits(); + + for (unsigned i = 0; i < halfBitWidth; ++i) { + const unsigned bitPos = halfBitWidth - i - 1; + SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); + // Get value of high bit + SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); + + // Shift + REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); + // Add LHS high bit + REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); + + SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); + + DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); + + // Update REM + SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); + REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); + } + + SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + Results.push_back(DIV); + Results.push_back(REM); +} + +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::i64) { + SmallVector Results; + LowerUDIVREM64(Op, DAG, Results); + return DAG.getMergeValues(Results, DL); + } + + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + + if (VT == MVT::i32) { + if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && + DAG.MaskedValueIsZero(Den, 
APInt::getHighBitsSet(32, 8))) { + // TODO: We technically could do this for i64, but shouldn't that just be + // handled by something generally reducing 64-bit division on 32-bit + // values to 32-bit? + return LowerDIVREM24(Op, DAG, false); + } + } + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); + + // RCP_LO = mul(RCP, Den) */ + SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); + + // RCP_HI = mulhu (RCP, Den) */ + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), + RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), + NEG_RCP_LO, RCP_LO, + ISD::SETEQ); + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); + + // RCP_S_E = RCP - E + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), + RCP_A_E, RCP_S_E, + ISD::SETEQ); + // Quotient = mulhu(Tmp0, Num) + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); + + // Remainder = Num - Num_S_Remainder + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, + Num_S_Remainder, + DAG.getConstant(-1, DL, VT), + DAG.getConstant(0, DL, VT), + ISD::SETUGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, DL, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Ops, DL); +} + +SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue NegOne = DAG.getConstant(-1, DL, VT); + + if (VT == MVT::i32 && + DAG.ComputeNumSignBits(LHS) > 8 && + DAG.ComputeNumSignBits(RHS) > 8) { + return LowerDIVREM24(Op, DAG, true); + } + if (VT == MVT::i64 && + DAG.ComputeNumSignBits(LHS) > 32 && + DAG.ComputeNumSignBits(RHS) > 32) { + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); + + //HiLo split + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), + LHS_Lo, RHS_Lo); + SDValue Res[2] = { + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), + DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) + }; + return DAG.getMergeValues(Res, DL); + } + + SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); + SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); + SDValue RSign = LHSign; // Remainder sign is the same as LHS + + LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); + + LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); + RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); + + SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); + SDValue Rem = Div.getValue(1); + + Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); + + Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); + Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); + + SDValue Res[2] = { + Div, + Rem + }; + return DAG.getMergeValues(Res, DL); +} + +// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) +SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); + SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); + + return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); +} + +SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src) + // if (src > 0.0 && src != result) + // result += 1.0 + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const 
unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, SL, MVT::i32), + DAG.getConstant(ExpBits, SL, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, SL, MVT::i32)); + + return Exp; +} + +SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + // Extract the upper half, since this is where we will find the sign and + // exponent. + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const unsigned FractBits = 52; + + // Extract the sign bit. + const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); + SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); + + // Extend back to to 64-bits. + SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + Zero, SignBit); + SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); + + SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); + const SDValue FractMask + = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); + + SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); + SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + + SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); + + return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + assert(Op.getValueType() == MVT::f64); + + APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); + SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); + SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); + SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); + + SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); + + APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); + SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); + + return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); +} + +SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { + // FNEARBYINT and FRINT are the same, except in their handling of FP + // exceptions. Those aren't really meaningful for us, and OpenCL only has + // rint, so just treat them as equivalent. + return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); +} + +// XXX - May require not supporting f32 denormals? 
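The LowerFRINT code a few lines above uses the classic 2^52 trick: adding and then subtracting copysign(0x1.0p+52, x) lets the FP adder's round-to-nearest-even discard the fractional bits, and any input whose magnitude already exceeds 0x1.fffffffffffffp+51 is an integer and is returned unchanged. A scalar model of the same idea, assuming the default rounding mode and strict IEEE evaluation (no fast-math), with a made-up helper name:

    #include <cmath>
    #include <cstdio>

    // Round to nearest integer (ties to even) without calling rint directly.
    static double rintVia2p52(double x) {
      const double C1 = 0x1.0p+52;             // 2^52
      const double C2 = 0x1.fffffffffffffp+51; // largest f64 that can still have a fraction
      double Magic = std::copysign(C1, x);
      double Rounded = (x + Magic) - Magic;    // fraction is rounded away by the FP add
      return std::fabs(x) > C2 ? x : Rounded;
    }

    int main() {
      std::printf("%.1f %.1f %.1f\n",
                  rintVia2p52(2.5),                  // 2.0 (ties to even)
                  rintVia2p52(-3.7),                 // -4.0
                  rintVia2p52(4503599627370496.0));  // 2^52, returned unchanged
      return 0;
    }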
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, + MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), SL, + MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, SL, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, SL, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, SL, MVT::f64), + DAG.getConstantFP(0.0, SL, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + +SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + // result = trunc(src); + // if (src < 0.0 && src != result) + // result += -1.0. 
+ + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); + const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); + + SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); + SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); + SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); + + SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(0, SL, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, + DAG.getConstant(1, SL, MVT::i32)); + + SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, + SL, MVT::f64, Hi); + + SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); + + SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + DAG.getConstant(32, SL, MVT::i32)); + + return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); +} + +SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue S0 = Op.getOperand(0); + if (S0.getValueType() != MVT::i64) + return SDValue(); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, false); + + assert(DestVT == MVT::f32); + + SDLoc DL(Op); + + // f32 uint_to_fp i64 + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(0, DL, MVT::i32)); + SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, + DAG.getConstant(1, DL, MVT::i32)); + SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, + DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 + return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); +} + +SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + return LowerINT_TO_FP64(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + SDLoc SL(Op); + + SDValue Src = Op.getOperand(0); + + SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); + + SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, + MVT::f64); + SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, + MVT::f64); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); + + SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); + + + SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); + + SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, + MVT::i32, FloorMul); + SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); + + SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, true); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + + if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) + return LowerFP64_TO_INT(Op, DAG, false); + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, + SelectionDAG &DAG) const { + EVT ExtraVT = cast(Op.getOperand(1))->getVT(); + MVT VT = Op.getSimpleValueType(); + MVT ScalarVT = VT.getScalarType(); + + if (!VT.isVector()) + return SDValue(); + + SDValue Src = Op.getOperand(0); + SDLoc DL(Op); + + // TODO: Don't scalarize on Evergreen? + unsigned NElts = VT.getVectorNumElements(); + SmallVector Args; + DAG.ExtractVectorElements(Src, Args, 0, NElts); + + SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); + for (unsigned I = 0; I < NElts; ++I) + Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, KnownZero, KnownOne); + + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} + +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. + (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} + +template +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, + uint32_t Offset, uint32_t Width, SDLoc DL) { + if (Width + Offset < 32) { + uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); + IntTy Result = static_cast(Shl) >> (32 - Width); + return DAG.getConstant(Result, DL, MVT::i32); + } + + return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); +} + +static bool usesAllNormalStores(SDNode *LoadVal) { + for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { + if (!ISD::isNormalStore(*I)) + return false; + } + + return true; +} + +// If we have a copy of an illegal type, replace it with a load / store of an +// equivalently sized legal type. This avoids intermediate bit pack / unpack +// instructions emitted when handling extloads and truncstores. 
Ideally we could +// recognize the pack / unpack pattern to eliminate it. +SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *SN = cast(N); + SDValue Value = SN->getValue(); + EVT VT = Value.getValueType(); + + if (isTypeLegal(VT) || SN->isVolatile() || + !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) + return SDValue(); + + LoadSDNode *LoadVal = cast(Value); + if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + return SDValue(); + + EVT MemVT = LoadVal->getMemoryVT(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); + + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + LoadVT, SL, + LoadVal->getChain(), + LoadVal->getBasePtr(), + LoadVal->getOffset(), + LoadVT, + LoadVal->getMemOperand()); + + SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); + DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + + return DAG.getStore(SN->getChain(), SL, NewLoad, + SN->getBasePtr(), SN->getMemOperand()); +} + +SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + + if (VT.isVector() || VT.getSizeInBits() > 32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + return SDValue(); + } + + // We need to use sext even for MUL_U24, because MUL_U24 is used + // for signed multiply of 8 and 16-bit types. + return DAG.getSExtOrTrunc(Mul, DL, VT); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: + return performMulCombine(N, DCI); + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + case ISD::SELECT: { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32) + return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + + // TODO: Implement min / max Evergreen instructions. 
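+ // The integer min / max combine below only runs for SI and later generations.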
+ if (VT == MVT::i32 && + Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } + } + + break; + } + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + assert(!N->getValueType(0).isVector() && + "Vector handling of BFE not implemented"); + ConstantSDNode *Width = dyn_cast(N->getOperand(2)); + if (!Width) + break; + + uint32_t WidthVal = Width->getZExtValue() & 0x1f; + if (WidthVal == 0) + return DAG.getConstant(0, DL, MVT::i32); + + ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); + if (!Offset) + break; + + SDValue BitsFrom = N->getOperand(0); + uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; + + bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; + + if (OffsetVal == 0) { + // This is already sign / zero extended, so try to fold away extra BFEs. + unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); + + unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); + if (OpSignBits >= SignBits) + return BitsFrom; + + EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); + if (Signed) { + // This is a sign_extend_inreg. Replace it to take advantage of existing + // DAG Combines. If not eliminated, we will match back to BFE during + // selection. + + // TODO: The sext_inreg of extended types ends, although we can could + // handle them in a single BFE. + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, + DAG.getValueType(SmallVT)); + } + + return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); + } + + if (ConstantSDNode *CVal = dyn_cast(BitsFrom)) { + if (Signed) { + return constantFoldBFE(DAG, + CVal->getSExtValue(), + OffsetVal, + WidthVal, + DL); + } + + return constantFoldBFE(DAG, + CVal->getZExtValue(), + OffsetVal, + WidthVal, + DL); + } + + if ((OffsetVal + WidthVal) >= 32) { + SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); + return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, + BitsFrom, ShiftVal); + } + + if (BitsFrom.hasOneUse()) { + APInt Demanded = APInt::getBitsSet(32, + OffsetVal, + OffsetVal + WidthVal); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || + TLI.SimplifyDemandedBits(BitsFrom, Demanded, + KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + } + + break; + } + + case ISD::STORE: + return performStoreCombine(N, DCI); + } + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +void AMDGPUTargetLowering::getOriginalFunctionArgs( + SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl &Ins, + SmallVectorImpl &OrigIns) const { + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + if (Ins[i].ArgVT == Ins[i].VT) { + OrigIns.push_back(Ins[i]); + continue; + } + + EVT VT; + if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { + // Vector has been split into scalars. + VT = Ins[i].ArgVT.getVectorElementType(); + } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && + Ins[i].ArgVT.getVectorElementType() != + Ins[i].VT.getVectorElementType()) { + // Vector elements have been promoted + VT = Ins[i].ArgVT; + } else { + // Vector has been spilt into smaller vectors. 
+ VT = Ins[i].VT; + } + + ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, + Ins[i].OrigArgIndex, Ins[i].PartOffset); + OrigIns.push_back(Arg); + } +} + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->isExactlyValue(1.0); + } + if (ConstantSDNode *C = dyn_cast(Op)) { + return C->isAllOnesValue(); + } + return false; +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast(Op)) { + return CFP->getValueAPF().isZero(); + } + if (ConstantSDNode *C = dyn_cast(Op)) { + return C->isNullValue(); + } + return false; +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch ((AMDGPUISD::NodeType)Opcode) { + case AMDGPUISD::FIRST_NUMBER: break; + // AMDIL DAG nodes + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(CLAMP) + NODE_NAME_CASE(COS_HW) + NODE_NAME_CASE(SIN_HW) + NODE_NAME_CASE(FMAX_LEGACY) + NODE_NAME_CASE(FMIN_LEGACY) + NODE_NAME_CASE(FMAX3) + NODE_NAME_CASE(SMAX3) + NODE_NAME_CASE(UMAX3) + NODE_NAME_CASE(FMIN3) + NODE_NAME_CASE(SMIN3) + NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(LDEXP) + NODE_NAME_CASE(FP_CLASS) + NODE_NAME_CASE(DOT4) + NODE_NAME_CASE(CARRY) + NODE_NAME_CASE(BORROW) + NODE_NAME_CASE(BFE_U32) + NODE_NAME_CASE(BFE_I32) + NODE_NAME_CASE(BFI) + NODE_NAME_CASE(BFM) + NODE_NAME_CASE(BREV) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) + NODE_NAME_CASE(MAD_U24) + NODE_NAME_CASE(MAD_I24) + NODE_NAME_CASE(TEXTURE_FETCH) + NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) + NODE_NAME_CASE(LOAD_CONSTANT) + NODE_NAME_CASE(LOAD_INPUT) + NODE_NAME_CASE(SAMPLE) + NODE_NAME_CASE(SAMPLEB) + NODE_NAME_CASE(SAMPLED) + NODE_NAME_CASE(SAMPLEL) + NODE_NAME_CASE(CVT_F32_UBYTE0) + NODE_NAME_CASE(CVT_F32_UBYTE1) + NODE_NAME_CASE(CVT_F32_UBYTE2) + NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) + NODE_NAME_CASE(CONST_DATA_PTR) + case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(SENDMSG) + NODE_NAME_CASE(INTERP_MOV) + NODE_NAME_CASE(INTERP_P1) + NODE_NAME_CASE(INTERP_P2) + NODE_NAME_CASE(STORE_MSKOR) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; + } + return nullptr; +} + +SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + RefinementSteps = 0; + return 
DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also an f64 rsq instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32) { + // Reciprocal, < 1 ulp error. + // + // This reciprocal approximation converges to < 0.5 ulp error with one + // Newton-Raphson iteration performed with two fused multiply-adds (FMAs). + + RefinementSteps = 0; + return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); + } + + // TODO: There is also an f64 rcp instruction, but the documentation is less + // clear on its precision. + + return SDValue(); +} + +static void computeKnownBitsForMinMax(const SDValue Op0, + const SDValue Op1, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) { + APInt Op0Zero, Op0One; + APInt Op1Zero, Op1One; + DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); + DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); + + KnownZero = Op0Zero & Op1Zero; + KnownOne = Op0One & Op1One; +} + +void AMDGPUTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. + + APInt KnownZero2; + APInt KnownOne2; + unsigned Opc = Op.getOpcode(); + + switch (Opc) { + default: + break; + case ISD::INTRINSIC_WO_CHAIN: { + // FIXME: The intrinsic should just use the node. + switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + case AMDGPUIntrinsic::AMDGPU_imax: + case AMDGPUIntrinsic::AMDGPU_umax: + case AMDGPUIntrinsic::AMDGPU_imin: + case AMDGPUIntrinsic::AMDGPU_umin: + computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), + KnownZero, KnownOne, DAG, Depth); + break; + default: + break; + } + + break; + } + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: { + KnownZero = APInt::getHighBitsSet(32, 31); + break; + } + + case AMDGPUISD::BFE_I32: + case AMDGPUISD::BFE_U32: { + ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CWidth) + return; + + unsigned BitWidth = 32; + uint32_t Width = CWidth->getZExtValue() & 0x1f; + + if (Opc == AMDGPUISD::BFE_U32) + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); + + break; + } + } +} + +unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( + SDValue Op, + const SelectionDAG &DAG, + unsigned Depth) const { + switch (Op.getOpcode()) { + case AMDGPUISD::BFE_I32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!Width) + return 1; + + unsigned SignBits = 32 - Width->getZExtValue() + 1; + ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Offset || !Offset->isNullValue()) + return SignBits; + + // TODO: Could probably figure something out with non-0 offsets. + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + return std::max(SignBits, Op0SignBits); + } + + case AMDGPUISD::BFE_U32: { + ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + return Width ?
32 - (Width->getZExtValue() & 0x1f) : 1; + } + + case AMDGPUISD::CARRY: + case AMDGPUISD::BORROW: + return 31; + + default: + return 1; + } +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h new file mode 100644 index 00000000000..fbb7d3c8843 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -0,0 +1,307 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUMachineFunction; +class AMDGPUSubtarget; +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +protected: + const AMDGPUSubtarget *Subtarget; + +private: + SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, + const SDValue &InitPtr, + SDValue Chain, + SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + /// \brief Lower vector stores by merging the vector elements into an integer + /// of the same bitwidth. + SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; + /// \brief Split a vector store into multiple scalar stores. + /// \returns The resulting chain. + + SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +protected: + static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); + static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); + + virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const; + + /// \brief Split a vector load into a scalar load of each component. 
+ SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector load into 2 loads of half the vector. + SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into a scalar store of each component. + SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Split a vector store into 2 stores of half the vector. + SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; + void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) const; + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + /// The SelectionDAGBuilder will automatically promote function arguments + /// with illegal types. However, this does not work for the AMDGPU targets + /// since the function arguments are stored in memory as these illegal types. + /// In order to handle this properly we need to get the original type sizes + /// from the LLVM IR Function and fix up the ISD::InputArg values before + /// passing them to AnalyzeFormalArguments() + void getOriginalFunctionArgs(SelectionDAG &DAG, + const Function *F, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<ISD::InputArg> &OrigIns) const; + void AnalyzeFormalArguments(CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + + bool isFAbsFree(EVT VT) const override; + bool isFNegFree(EVT VT) const override; + bool isTruncateFree(EVT Src, EVT Dest) const override; + bool isTruncateFree(Type *Src, Type *Dest) const override; + + bool isZExtFree(Type *Src, Type *Dest) const override; + bool isZExtFree(EVT Src, EVT Dest) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + + MVT getVectorIdxTy() const override; + bool isSelectSupported(SelectSupportKind) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + bool ShouldShrinkFPConstant(EVT VT) const override; + bool shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtType, + EVT ExtVT) const override; + + bool isLoadBitCastBeneficial(EVT, EVT) const override; + + bool storeOfVectorConstantIsCheap(EVT MemVT, + unsigned NumElem, + unsigned AS) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; + SDValue CombineFMinMaxLegacy(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const; + SDValue
CombineIMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; + + const char* getTargetNodeName(unsigned Opcode) const override; + + SDValue getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; + + virtual SDNode *PostISelFolding(MachineSDNode *N, + SelectionDAG &DAG) const { + return N; + } + + /// \brief Determine which of the bits specified in \p Mask are known to be + /// either zero or one and return them in the \p KnownZero and \p KnownOne + /// bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. + /// + /// \returns a RegisterSDNode representing Reg. + virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; +}; + +namespace AMDGPUISD { + +enum NodeType : unsigned { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + DWORDADDR, + FRACT, + CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. + COS_HW, + SIN_HW, + FMAX_LEGACY, + FMIN_LEGACY, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, + URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, + RSQ_LEGACY, + RSQ_CLAMPED, + LDEXP, + FP_CLASS, + DOT4, + CARRY, + BORROW, + BFE_U32, // Extract range of bits with zero extension to 32-bits. + BFE_I32, // Extract range of bits with sign extension to 32-bits. + BFI, // (src0 & src1) | (~src0 & src2) + BFM, // Insert a range of bits into a 32-bit word. + BREV, // Reverse bits. + MUL_U24, + MUL_I24, + MAD_U24, + MAD_I24, + TEXTURE_FETCH, + EXPORT, + CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, + LOAD_INPUT, + SAMPLE, + SAMPLEB, + SAMPLED, + SAMPLEL, + + // These cvt_f32_ubyte* nodes need to remain consecutive and in order. + CVT_F32_UBYTE0, + CVT_F32_UBYTE1, + CVT_F32_UBYTE2, + CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, + /// Pointer to the start of the shader's constant data. 
+ CONST_DATA_PTR, + SENDMSG, + INTERP_MOV, + INTERP_P1, + INTERP_P2, + FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, + STORE_MSKOR, + LOAD_CONSTANT, + TBUFFER_STORE_FORMAT, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp new file mode 100644 index 00000000000..15a3d543a68 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -0,0 +1,369 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRINFO_NAMED_OPS +#define GET_INSTRMAP_INFO +#include "AMDGPUGenInstrInfo.inc" + +// Pin the vtable to this file. +void AMDGPUInstrInfo::anchor() {} + +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUGenInstrInfo(-1, -1), ST(st) {} + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return nullptr; +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + llvm_unreachable("Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + 
llvm_unreachable("Not Implemented"); +} + +bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { + MachineBasicBlock *MBB = MI->getParent(); + int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::addr); + // addr is a custom operand with multiple MI operands, and only the + // first MI operand is given a name. + int RegOpIdx = OffsetOpIdx + 1; + int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::chan); + if (isRegisterLoad(*MI)) { + int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::dst); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + getIndirectAddrRegClass()->getRegister(Address)); + } else { + buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), + Address, OffsetReg); + } + } else if (isRegisterStore(*MI)) { + int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::val); + unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); + unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), + MI->getOperand(ValOpIdx).getReg()); + } else { + buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), + calculateIndirectAddress(RegIndex, Channel), + OffsetReg); + } + } else { + return false; + } + + MBB->erase(MI); + return true; +} + +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex) const { +// TODO: Implement this function + return nullptr; +} +MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + // TODO: Implement this function + return nullptr; +} +bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef Ops) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::enableClusterLoads() const { + return true; +} + +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. 
This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} + +int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = -1; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + unsigned Reg = LI->first; + if (TargetRegisterInfo::isVirtualRegister(Reg) || + !IndirectRC->contains(Reg)) + continue; + + unsigned RegIndex; + unsigned RegEnd; + for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; + ++RegIndex) { + if (IndirectRC->getRegister(RegIndex) == Reg) + break; + } + Offset = std::max(Offset, (int)RegIndex); + } + + return Offset + 1; +} + +int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + assert(!MFI->hasVarSizedObjects()); + + if (MFI->getNumObjects() == 0) { + return -1; + } + + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + + return getIndirectIndexBegin(MF) + Offset; +} + +int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { + switch (Channels) { + default: return Opcode; + case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); + case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); + case 3: return AMDGPU::getMaskedMIMGOp(Opcode, 
AMDGPU::Channels_3); + } +} + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. +namespace llvm { +namespace AMDGPU { +static int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); +} +} +} + +// This must be kept in sync with the SISubtarget class in SIInstrInfo.td +enum SISubtarget { + SI = 0, + VI = 1 +}; + +static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { + switch (Gen) { + default: + return SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + return VI; + } +} + +int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { + int MCOp = AMDGPU::getMCOpcode( + Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h new file mode 100644 index 00000000000..86d3962b385 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -0,0 +1,206 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H + +#include "AMDGPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUSubtarget; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + virtual void anchor(); +protected: + const AMDGPUSubtarget &ST; +public: + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const override; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const override; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + 
MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const override; + + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex) const override; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + ArrayRef Ops, + MachineBasicBlock::iterator InsertPt, + MachineInstr *LoadMI) const override; + +public: + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexBegin(const MachineFunction &MF) const; + + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexEnd(const MachineFunction &MF) const; + + bool canFoldMemoryOperand(const MachineInstr *MI, + ArrayRef Ops) const override; + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl &NewMIs) const override; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl &NewNodes) const override; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; + + bool enableClusterLoads() const override; + + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const override; + + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + bool isPredicated(const MachineInstr *MI) const override; + bool SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const override; + bool DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const override; + bool isPredicable(MachineInstr *MI) const override; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + // Helper functions that check the opcode for status information + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; + + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + + /// \brief Return the descriptor of the target-specific machine instruction + /// that corresponds to the specified pseudo or native opcode. + const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { + return get(pseudoToMCOpcode(Opcode)); + } + +//===---------------------------------------------------------------------===// +// Pure virtual funtions to be implemented by sub-classes. 
+//===---------------------------------------------------------------------===// + + virtual bool isMov(unsigned opcode) const = 0; + + /// \brief Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accesed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const = 0; + + /// \returns The register class to be used for loading and storing values + /// from an "Indirect Address" . + virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; + + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const = 0; + + /// \brief Build a MOV instruction. + virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const = 0; + + /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the + /// equivalent opcode that writes \p Channels Channels. + int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; + +}; + +namespace AMDGPU { + int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); +} // End namespace AMDGPU + +} // End llvm namespace + +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td new file mode 100644 index 00000000000..b413897d9d2 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -0,0 +1,245 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node defintions for the AMDGPU target. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPULdExpOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUFPClassOp : SDTypeProfile<1, 2, + [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + +// float, float, float, vcc +def AMDGPUFmasOp : SDTypeProfile<1, 4, + [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] +>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// This argument to this node is a dword address. +def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; +def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) result clamped to +/- max_float. +def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; + +def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; + +def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; + +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [] +>; + +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats, where a nan comparison fails. +def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [] +>; + +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. 
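+// That is why the SDNPCommutative / SDNPAssociative flags on the three-operand min / max nodes below are left commented out.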
+// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 +def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; + +// out = (src1 > src0) ? 1 : 0 +def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; + + +def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", + SDTIntToFPOp, []>; +def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", + SDTIntToFPOp, []>; + + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. +// out = (2^32 / a) + e +// e is rounding error +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; + +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; + +// Single or double precision division fixup. +// Special case divide fixup and flags(src0 = Quotient, src1 = +// Denominator, src2 = Numerator). +def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look Up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + +def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + +// MSKOR instructions are atomic memory instructions used mainly for storing +// 8-bit and 16-bit values. 
The definition is: +// +// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) +// +// src0: vec4(src, 0, 0, mask) +// src1: dst - rat offset (aka pointer) in dwords +def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", + SDTypeProfile<0, 2, []>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def AMDGPUround : SDNode<"ISD::FROUND", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; + +def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; +def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; +def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; + +def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; + +// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when +// performing the mulitply. The result is a 32-bit value. +def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; + +def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, + [] +>; +def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", + SDTypeProfile<1, 3, [SDTCisFP<0>]>, + [SDNPInGlue, SDNPOutGlue]>; + +def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", + SDTypeProfile<1, 4, [SDTCisFP<0>]>, + [SDNPInGlue]>; + +//===----------------------------------------------------------------------===// +// Flow Control Profile Types +//===----------------------------------------------------------------------===// +// Branch instruction where second and third are basic blocks +def SDTIL_BRCond : SDTypeProfile<0, 2, [ + SDTCisVT<0, OtherVT> + ]>; + +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td new file mode 100644 index 00000000000..72cab39277c --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -0,0 +1,682 @@ +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction defs that are common to all hw codegen +// targets. 
+// +//===----------------------------------------------------------------------===// + +class AMDGPUInst pattern> : Instruction { + field bit isRegisterLoad = 0; + field bit isRegisterStore = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = NullALU; + + let TSFlags{63} = isRegisterLoad; + let TSFlags{62} = isRegisterStore; +} + +class AMDGPUShaderInst pattern> + : AMDGPUInst { + + field bits<32> Inst = 0xffffffff; + +} + +def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; + +def InstFlag : OperandWithDefaultOps ; +def ADDRIndirect : ComplexPattern; + +let OperandType = "OPERAND_IMMEDIATE" in { + +def u32imm : Operand { + let PrintMethod = "printU32ImmOperand"; +} + +def u16imm : Operand { + let PrintMethod = "printU16ImmOperand"; +} + +def u8imm : Operand { + let PrintMethod = "printU8ImmOperand"; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +def brtarget : Operand; + +//===----------------------------------------------------------------------===// +// PatLeafs for floating-point comparisons +//===----------------------------------------------------------------------===// + +def COND_OEQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] +>; + +def COND_ONE : PatLeaf < + (cond), + [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] +>; + +def COND_OGT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] +>; + +def COND_OGE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] +>; + +def COND_OLT : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] +>; + +def COND_OLE : PatLeaf < + (cond), + [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] +>; + + +def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; +def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for unsigned / unordered comparisons +//===----------------------------------------------------------------------===// + +def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; +def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; +def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; +def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; +def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; +def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; + +// XXX - For some reason R600 version is preferring to use unordered +// for setne? 
+def COND_UNE_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] +>; + +//===----------------------------------------------------------------------===// +// PatLeafs for signed comparisons +//===----------------------------------------------------------------------===// + +def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; +def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; +def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; +def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; + +//===----------------------------------------------------------------------===// +// PatLeafs for integer equality +//===----------------------------------------------------------------------===// + +def COND_EQ : PatLeaf < + (cond), + [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] +>; + +def COND_NE : PatLeaf < + (cond), + [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] +>; + +def COND_NULL : PatLeaf < + (cond), + [{(void)N; return false;}] +>; + +//===----------------------------------------------------------------------===// +// Load/Store Pattern Fragments +//===----------------------------------------------------------------------===// + +class PrivateMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; +}]>; + +class PrivateLoad : PrivateMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class PrivateStore : PrivateMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def load_private : PrivateLoad ; + +def truncstorei8_private : PrivateStore ; +def truncstorei16_private : PrivateStore ; +def store_private : PrivateStore ; + +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +class AZExtLoadBase : PatFrag<(ops node:$ptr), + (ld_node node:$ptr), [{ + LoadSDNode *L = cast(N); + return L->getExtensionType() == ISD::ZEXTLOAD || + L->getExtensionType() == ISD::EXTLOAD; +}]>; + +def az_extload : AZExtLoadBase ; + +def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def extloadi8_private : PrivateLoad ; +def sextloadi8_private : 
PrivateLoad ; + +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +def extloadi16_private : PrivateLoad ; +def sextloadi16_private : PrivateLoad ; + +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def az_extloadi32_global : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isGlobalLoad(dyn_cast(N)); +}]>; + +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def az_extloadi32_constant : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isConstantLoad(dyn_cast(N), -1); +}]>; + +def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast(N)); +}]>; + +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def local_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + +def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + +def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isLocalStore(dyn_cast(N)); +}]>; + +def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isLocalLoad(dyn_cast(N)); +}]>; + +class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; + +class local_binary_atomic_op : + PatFrag<(ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + + +def atomic_swap_local : local_binary_atomic_op; +def atomic_load_add_local : local_binary_atomic_op; +def atomic_load_sub_local : local_binary_atomic_op; +def atomic_load_and_local : 
local_binary_atomic_op; +def atomic_load_or_local : local_binary_atomic_op; +def atomic_load_xor_local : local_binary_atomic_op; +def atomic_load_nand_local : local_binary_atomic_op; +def atomic_load_min_local : local_binary_atomic_op; +def atomic_load_max_local : local_binary_atomic_op; +def atomic_load_umin_local : local_binary_atomic_op; +def atomic_load_umax_local : local_binary_atomic_op; + +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; + +multiclass AtomicCmpSwapLocal { + + def _32_local : PatFrag < + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getMemoryVT() == MVT::i32 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; + + def _64_local : PatFrag< + (ops node:$ptr, node:$cmp, node:$swap), + (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ + AtomicSDNode *AN = cast(N); + return AN->getMemoryVT() == MVT::i64 && + AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; + }]>; +} + +defm atomic_cmp_swap : AtomicCmpSwapLocal ; + +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + +class global_binary_atomic_op : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] +>; + +def atomic_swap_global : global_binary_atomic_op; +def atomic_add_global : global_binary_atomic_op; +def atomic_and_global : global_binary_atomic_op; +def atomic_max_global : global_binary_atomic_op; +def atomic_min_global : global_binary_atomic_op; +def atomic_or_global : global_binary_atomic_op; +def atomic_sub_global : global_binary_atomic_op; +def atomic_umax_global : global_binary_atomic_op; +def atomic_umin_global : global_binary_atomic_op; +def atomic_xor_global : global_binary_atomic_op; + +//===----------------------------------------------------------------------===// +// Misc Pattern Fragments +//===----------------------------------------------------------------------===// + +class Constants { +int TWO_PI = 0x40c90fdb; +int PI = 0x40490fdb; +int TWO_PI_INV = 0x3e22f983; +int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding +int FP32_NEG_ONE = 0xbf800000; +int FP32_ONE = 0x3f800000; +} +def CONST : Constants; + +def FP_ZERO : PatLeaf < + (fpimm), + [{return N->getValueAPF().isZero();}] +>; + +def FP_ONE : PatLeaf < + (fpimm), + [{return N->isExactlyValue(1.0);}] +>; + +def FP_HALF : PatLeaf < + (fpimm), + [{return N->isExactlyValue(0.5);}] +>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { + +class CLAMP : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] +>; + +class FABS : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set f32:$dst, (fabs f32:$src0))] +>; + +class FNEG : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set f32:$dst, (fneg f32:$src0))] +>; + +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore { 
+let UseNamedOperandTable = 1 in { + + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} +} + +} // End isCodeGenOnly = 1, isPseudo = 1 + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common + : Pat < + (fpow f32:$src0, f32:$src1), + (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) +>; + +/* Other helper patterns */ +/* --------------------- */ + +/* Extract element pattern */ +class Extract_Element + : Pat< + (sub_type (vector_extract vec_type:$src, sub_idx)), + (EXTRACT_SUBREG $src, sub_reg) +>; + +/* Insert element pattern */ +class Insert_Element + : Pat < + (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (INSERT_SUBREG $vec, $elem, sub_reg) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. +// bitconvert pattern +class BitConvert : Pat < + (dt (bitconvert (st rc:$src0))), + (dt rc:$src0) +>; + +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. +class DwordAddrPat : Pat < + (vt (AMDGPUdwordaddr (vt rc:$addr))), + (vt rc:$addr) +>; + +// BFI_INT patterns + +multiclass BFIPatterns { + // Definition from ISA doc: + // (y & x) | (z & ~x) + def : Pat < + (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), + (BFI_INT $x, $y, $z) + >; + + // SHA-256 Ch function + // z ^ (x & (y ^ z)) + def : Pat < + (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), + (BFI_INT $x, $y, $z) + >; + + def : Pat < + (fcopysign f32:$src0, f32:$src1), + (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + >; + + def : Pat < + (f64 (fcopysign f64:$src0, f64:$src1)), + (REG_SEQUENCE RC64, + (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, + (BFI_INT (LoadImm32 0x7fffffff), + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) + >; +} + +// SHA-256 Ma patterns + +// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y +class SHA256MaPattern : Pat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) +>; + +// Bitfield extract patterns + +def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ + return isMask_32(N->getZExtValue()); +}]>; + +def IMMPopCount : SDNodeXFormgetTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), + MVT::i32); +}]>; + +class BFEPattern : Pat < + (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), + (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) +>; + +// rotr pattern +class ROTRPattern : Pat < + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +>; + +// 24-bit arithmetic patterns +def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; + +// Special conversion patterns + +def cvt_rpi_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor (fadd $src, FP_HALF))), + [{ (void) N; return TM.Options.NoNaNsFPMath; }] +>; + +def cvt_flr_i32_f32 : PatFrag < + (ops node:$src), + (fp_to_sint (ffloor $src)), + [{ (void)N; return TM.Options.NoNaNsFPMath; }] +>; + +/* +class UMUL24Pattern : Pat < + (mul U24:$x, U24:$y), + (UMUL24 $x, $y) 
+>; +*/ + +class IMad24Pat : Pat < + (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +class UMad24Pat : Pat < + (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), + (Inst $src0, $src1, $src2) +>; + +multiclass Expand24IBitOps { + def _expand_imad24 : Pat < + (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_imul24 : Pat < + (AMDGPUmul_i24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +multiclass Expand24UBitOps { + def _expand_umad24 : Pat < + (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), + (AddInst (MulInst $src0, $src1), $src2) + >; + + def _expand_umul24 : Pat < + (AMDGPUmul_u24 i32:$src0, i32:$src1), + (MulInst $src0, $src1) + >; +} + +class RcpPat : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + +include "SIInstrInfo.td" + diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp new file mode 100644 index 00000000000..e94bb6013d8 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -0,0 +1,77 @@ +//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDGPUGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() + : TargetIntrinsicInfo() {} + +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { + static const char *const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + if (IntrID < Intrinsic::num_intrinsics) { + return nullptr; + } + assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + +unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, + unsigned Len) const { + if (!StringRef(Name, Len).startswith("llvm.")) + return 0; // All intrinsics start with 'llvm.' 
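
// A bit-level reference for the BFIPatterns and BFEPattern classes defined in
// AMDGPUInstructions.td above (illustrative helpers only, not target code).
#include <cstdint>

// BFI_INT(x, y, z): bits of y where x is 1, bits of z where x is 0.
// The SHA-256 "Ch" form z ^ (x & (y ^ z)) computes the same function, which
// is why both source patterns map to the one instruction.
static uint32_t bfi(uint32_t X, uint32_t Y, uint32_t Z) {
  return (Y & X) | (Z & ~X);
}

// fcopysign for f32: magnitude bits from Src0, sign bit from Src1.
static uint32_t copySignBits(uint32_t Src0, uint32_t Src1) {
  return bfi(0x7fffffffu, Src0, Src1);
}

// BFEPattern: (X >> Shift) & Mask, with Mask a zero-based bitmask, is
// selected as BFE(X, Shift, popcount(Mask)); the width operand is the mask's
// popcount.
static uint32_t bfeU32(uint32_t X, uint32_t Shift, uint32_t Width) {
  return Width >= 32 ? X >> Shift : (X >> Shift) & ((1u << Width) - 1);
}
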
+ +#define GET_FUNCTION_RECOGNIZER +#include "AMDGPUGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID = + (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); + + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned numTys) const { + llvm_unreachable("Not implemented"); +} diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h new file mode 100644 index 00000000000..4c95b5ec097 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -0,0 +1,48 @@ +//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. +// +//===-----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace AMDGPUIntrinsic { +enum ID { + last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDGPUGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDGPU_intrinsics +}; + +} // end namespace AMDGPUIntrinsic + +class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +public: + AMDGPUIntrinsicInfo(); + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned numTys = 0) const override; + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, + Type **Tys = nullptr, + unsigned numTys = 0) const override; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td new file mode 100644 index 00000000000..ab489cd2a4a --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -0,0 +1,90 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. 
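
// Sketch of the intrinsic-ID layout that AMDGPUIntrinsicInfo::getName above
// relies on: target intrinsic IDs start right after the generic LLVM ones, so
// the generated name table is indexed with IntrID - Intrinsic::num_intrinsics.
// All names and counts below are stand-ins, not the generated values.
#include <cassert>
#include <string>

namespace example {
constexpr unsigned NumGenericIntrinsics = 1000; // stands in for Intrinsic::num_intrinsics
enum ID : unsigned {
  first_amdgpu = NumGenericIntrinsics,          // last_non_AMDGPU_intrinsic + 1
  second_amdgpu,
  num_amdgpu_intrinsics
};
const char *const Names[] = {"llvm.AMDGPU.first", "llvm.AMDGPU.second"};

std::string getName(unsigned IntrID) {
  assert(IntrID >= NumGenericIntrinsics && IntrID < num_amdgpu_intrinsics);
  return Names[IntrID - NumGenericIntrinsics];  // same arithmetic as above
}
} // namespace example
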
+// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + + // This is named backwards (instead of rsq_legacy) so we don't have + // to define it with the public builtins intrinsics. This is a + // workaround for how intrinsic names are parsed. If the name is + // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant + // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. + def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax 
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; +} + +// Legacy names for compatibility. +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; +} + +include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp new file mode 100644 index 00000000000..20831460b93 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -0,0 +1,154 @@ +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
+// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "AMDGPUTargetMachine.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "R600InstrInfo.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): + Ctx(ctx), ST(st) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + + int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); + + if (MCOpcode == -1) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " + "a target-specific version: " + Twine(MI->getOpcode())); + } + + OutMI.setOpcode(MCOpcode); + + for (const MachineOperand &MO : MI->explicit_operands()) { + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::createReg(MO.getReg()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( + MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); + MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); + break; + } + case MachineOperand::MO_TargetIndex: { + assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + case MachineOperand::MO_ExternalSymbol: { + MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + MCOp = MCOperand::createExpr(Expr); + break; + } + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + const AMDGPUSubtarget &STI = MF->getSubtarget(); + AMDGPUMCInstLower MCInstLowering(OutContext, STI); + +#ifdef _DEBUG + StringRef Err; + if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { + errs() << "Warning: Illegal instruction detected: " << Err << "\n"; + MI->dump(); + } +#endif + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = MI; + ++I; + while (I != MBB->end() && I->isInsideBundle()) { + EmitInstruction(I); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + + if (STI.dumpCode()) { + // Disassemble instruction/operands to text. 
+ DisasmLines.resize(DisasmLines.size() + 1); + std::string &DisasmLine = DisasmLines.back(); + raw_string_ostream DisasmStream(DisasmLine); + + AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), + *MF->getSubtarget().getInstrInfo(), + *MF->getSubtarget().getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), + MF->getSubtarget()); + + // Disassemble instruction/operands to hex representation. + SmallVector Fixups; + SmallVector CodeBytes; + raw_svector_ostream CodeStream(CodeBytes); + + auto &ObjStreamer = static_cast(*OutStreamer); + MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); + InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, + MF->getSubtarget()); + CodeStream.flush(); + + HexLines.resize(HexLines.size() + 1); + std::string &HexLine = HexLines.back(); + raw_string_ostream HexStream(HexLine); + + for (size_t i = 0; i < CodeBytes.size(); i += 4) { + unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i]; + HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord); + } + + DisasmStream.flush(); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size()); + } + } +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h new file mode 100644 index 00000000000..d322fe072b2 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -0,0 +1,35 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H + +namespace llvm { + +class AMDGPUSubtarget; +class MachineInstr; +class MCContext; +class MCInst; + +class AMDGPUMCInstLower { + MCContext &Ctx; + const AMDGPUSubtarget &ST; + +public: + AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp new file mode 100644 index 00000000000..21c7da66323 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -0,0 +1,25 @@ +#include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +using namespace llvm; + +static const char *const ShaderTypeAttribute = "ShaderType"; + +// Pin the vtable to this file. 
+void AMDGPUMachineFunction::anchor() {} + +AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ScratchSize(0), + IsKernel(true) { + Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, ShaderType)) + llvm_unreachable("Can't parse shader type!"); + } +} diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h new file mode 100644 index 00000000000..f5e4694e76f --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -0,0 +1,45 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H + +#include "llvm/CodeGen/MachineFunction.h" +#include + +namespace llvm { + +class AMDGPUMachineFunction : public MachineFunctionInfo { + virtual void anchor(); + unsigned ShaderType; + +public: + AMDGPUMachineFunction(const MachineFunction &MF); + /// A map to keep track of local memory objects and their offsets within + /// the local memory space. + std::map LocalMemoryObjects; + /// Number of bytes in the LDS that are being used. + unsigned LDSSize; + + /// Start of implicit kernel args + unsigned ABIArgOffset; + + unsigned getShaderType() const { + return ShaderType; + } + + unsigned ScratchSize; + bool IsKernel; +}; + +} +#endif diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp new file mode 100644 index 00000000000..4a65bfc57f1 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,407 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. 
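
// How a front end might tag a function so that the AMDGPUMachineFunction
// constructor above picks up a non-default shader type. The attribute name
// "ShaderType" matches the code above; the use of addFnAttr/utostr here is an
// illustrative sketch, not something this patch adds.
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Function.h"

void setShaderType(llvm::Function &F, unsigned Type) {
  // Stored as a string attribute; parsed back with StringRef::getAsInteger.
  F.addFnAttr("ShaderType", llvm::utostr(Type));
}
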
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Promote Alloca"; } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + const Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(const Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value * +calculateVectorIndex(Value *Ptr, + const std::map &GEPIdx) { + if (isa(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast(Ptr); + + auto I = GEPIdx.find(GEP); + return I == GEPIdx.end() ? nullptr : I->second; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME we only support simple cases + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +// Not an instruction handled below to turn into a vector. +// +// TODO: Check isTriviallyVectorizable for calls and handle other +// instructions. 
+static bool canVectorizeInst(Instruction *Inst) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + return true; + default: + return false; + } +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now. + if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map GEPVectorIdx; + std::vector WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast(AllocaUser); + if (!GEP) { + if (!canVectorizeInst(cast(AllocaUser))) + return false; + + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. + if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + if (!canVectorizeInst(cast(GEPUser))) + return false; + + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector " + << *AllocaTy << " -> " << *VectorTy << '\n'); + + for (std::vector::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + } + return true; +} + +static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { + bool Success = true; + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (isa(User)) { + WorkList.push_back(User); + continue; + } + + // FIXME: Correctly handle ptrtoint instructions. 
+ Instruction *UseInst = dyn_cast(User); + if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + return false; + + if (!User->getType()->isPointerTy()) + continue; + + WorkList.push_back(User); + + Success &= collectUsesWithPtrTypes(User, WorkList); + } + return Success; +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. + unsigned WorkGroupSize = 256; + int AllocaSize = + WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + std::vector WorkList; + + if (!collectUsesWithPtrTypes(&I, WorkList)) { + DEBUG(dbgs() << " Do not know how to convert all uses\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); + GlobalVariable *GV = new GlobalVariable( + *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); + Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); + Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + for (std::vector::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + // The operand's value should be corrected on its own. + if (isa(V)) + continue; + + // FIXME: It doesn't really make sense to try to do this for all + // instructions. 
+ V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast(Call); + if (!Intr) { + std::vector ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), + NewType, F->getAttributes()); + Function *NewF = cast(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast(Intr); + Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp new file mode 100644 index 00000000000..3ca0eca3417 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -0,0 +1,63 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom used feature of GPUS, so +// they are not supported at this time. 
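
// Closed form of the per-workitem index that AMDGPUPromoteAlloca computes
// above from llvm.r600.read.tidig.{x,y,z} and llvm.r600.read.local.size.{y,z};
// it selects each workitem's private slot in the [WorkGroupSize x Ty] LDS
// array. The function below is an illustrative restatement, not pass code.
unsigned flatWorkitemId(unsigned TIdX, unsigned TIdY, unsigned TIdZ,
                        unsigned SizeY, unsigned SizeZ) {
  // TID = tid.x * (local_size.y * local_size.z) + tid.y * local_size.z + tid.z
  return TIdX * SizeY * SizeZ + TIdY * SizeZ + TIdZ;
}
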
+//===----------------------------------------------------------------------===// + +const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const MCPhysReg* +AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + return AMDGPU::NoRegister; +} + +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { + static const unsigned SubRegs[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, + AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, + AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, + AMDGPU::sub15 + }; + + assert(Channel < array_lengthof(SubRegs)); + return SubRegs[Channel]; +} + +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + return getSubRegFromChannel(IndirectIndex); +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h new file mode 100644 index 00000000000..cfd800bdc70 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -0,0 +1,64 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUSubtarget; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + static const MCPhysReg CalleeSavedReg; + + AMDGPURegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override { + assert(!"Unimplemented"); return BitVector(); + } + + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { + assert(!"Unimplemented"); return nullptr; + } + + virtual unsigned getHWRegIndex(unsigned Reg) const { + assert(!"Unimplemented"); return 0; + } + + /// \returns the sub reg enum value for the given \p Channel + /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) + unsigned getSubRegFromChannel(unsigned Channel) const; + + const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td new file mode 100644 index 00000000000..835a1464395 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -0,0 +1,26 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + +foreach Index = 0-15 in { + // Indices are used in a variety of ways here, so don't set a size/offset. + def sub#Index : SubRegIndex<-1, -1>; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp new file mode 100644 index 00000000000..605ccd0e136 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -0,0 +1,133 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineScheduler.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-subtarget" + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget & +AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS) { + // Determine default and user-specified characteristics + // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be + // enabled, but some instructions do not respect them and they run at the + // double precision rate, so don't enable by default. + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. 
+ + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + FullFS += FS; + + if (GPU == "" && TT.getArch() == Triple::amdgcn) + GPU = "SI"; + + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + FP64Denormals = false; + } + return *this; +} + +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), + DumpCode(false), R600ALUInst(false), HasVertexCache(false), + TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), + FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), + CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), + EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), + WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), + EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), + GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), + FrameLowering(TargetFrameLowering::StackGrowsUp, + 64 * 16, // Maximum stack alignment (long16) + 0), + InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { + + initializeSubtargetDependencies(TT, GPU, FS); + + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + InstrInfo.reset(new R600InstrInfo(*this)); + TLInfo.reset(new R600TargetLowering(TM, *this)); + } else { + InstrInfo.reset(new SIInstrInfo(*this)); + TLInfo.reset(new SITargetLowering(TM, *this)); + } +} + +unsigned AMDGPUSubtarget::getStackEntrySize() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + switch(getWavefrontSize()) { + case 16: + return 8; + case 32: + return hasCaymanISA() ? 4 : 8; + case 64: + return 4; + default: + llvm_unreachable("Illegal wavefront size."); + } +} + +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { + switch(getGeneration()) { + default: llvm_unreachable("ChipID unknown"); + case SEA_ISLANDS: return 12; + } +} + +bool AMDGPUSubtarget::isVGPRSpillingEnabled( + const SIMachineFunctionInfo *MFI) const { + return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +} + +void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (getGeneration() >= SOUTHERN_ISLANDS) { + + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h new file mode 100644 index 00000000000..0d40d14f820 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -0,0 +1,282 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#include "AMDGPU.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +namespace llvm { + +class SIMachineFunctionInfo; + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { + +public: + enum Generation { + R600 = 0, + R700, + EVERGREEN, + NORTHERN_ISLANDS, + SOUTHERN_ISLANDS, + SEA_ISLANDS, + VOLCANIC_ISLANDS, + }; + + enum { + FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + }; + +private: + std::string DevName; + bool Is64bit; + bool DumpCode; + bool R600ALUInst; + bool HasVertexCache; + short TexVTXClauseSize; + Generation Gen; + bool FP64; + bool FP64Denormals; + bool FP32Denormals; + bool FastFMAF32; + bool CaymanISA; + bool FlatAddressSpace; + bool EnableIRStructurizer; + bool EnablePromoteAlloca; + bool EnableIfCvt; + bool EnableLoadStoreOpt; + unsigned WavefrontSize; + bool CFALUBug; + int LocalMemorySize; + bool EnableVGPRSpilling; + bool SGPRInitBug; + bool IsGCN; + bool GCN1Encoding; + bool GCN3Encoding; + bool CIInsts; + bool FeatureDisable; + int LDSBankCount; + + AMDGPUFrameLowering FrameLowering; + std::unique_ptr TLInfo; + std::unique_ptr InstrInfo; + InstrItineraryData InstrItins; + Triple TargetTriple; + +public: + AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + TargetMachine &TM); + AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + const AMDGPUFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AMDGPUInstrInfo *getInstrInfo() const override { + return InstrInfo.get(); + } + const AMDGPURegisterInfo *getRegisterInfo() const override { + return &InstrInfo->getRegisterInfo(); + } + AMDGPUTargetLowering *getTargetLowering() const override { + return TLInfo.get(); + } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool is64bit() const { + return Is64bit; + } + + bool hasVertexCache() const { + return HasVertexCache; + } + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; + } + + Generation getGeneration() const { + return Gen; + } + + bool hasHWFP64() const { + return FP64; + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFP64Denormals() const { + return FP64Denormals; + } + + bool hasFastFMAF32() const { + return FastFMAF32; + } + + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + + bool hasBFE() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFM() const { + return hasBFE(); + } + + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + if (Size == 64) + return (getGeneration() >= SOUTHERN_ISLANDS); + + return false; + } + + bool hasMulU24() const { + return (getGeneration() >= 
EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool IsIRStructurizerEnabled() const { + return EnableIRStructurizer; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + bool isIfCvtEnabled() const { + return EnableIfCvt; + } + + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + + unsigned getStackEntrySize() const; + + bool hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } + + bool hasSGPRInitBug() const { + return SGPRInitBug; + } + + int getLDSBankCount() const { + return LDSBankCount; + } + + unsigned getAmdKernelCodeChipID() const; + + bool enableMachineScheduler() const override { + return true; + } + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const override; + + // Helper functions to simplify if statements + bool isTargetELF() const { + return false; + } + + StringRef getDeviceName() const { + return DevName; + } + + bool dumpCode() const { + return DumpCode; + } + bool r600ALUEncoding() const { + return R600ALUInst; + } + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + + unsigned getMaxWavesPerCU() const { + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 10; + + // FIXME: Not sure what this is for other subtagets. + llvm_unreachable("do not know max waves per CU for this subtarget."); + } + + bool enableSubRegLiveness() const override { + return true; + } +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp new file mode 100644 index 00000000000..a9a911a8efe --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -0,0 +1,292 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. 
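+///
+/// Two concrete subclasses are defined below: R600TargetMachine for the
+/// VLIW generations (R600 through Cayman) and GCNTargetMachine for
+/// Southern Islands and later GCN parts.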
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "AMDGPUTargetTransformInfo.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600MachineScheduler.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include + +using namespace llvm; + +extern "C" void LLVMInitializeAMDGPUTarget() { + // Register the target + RegisterTargetMachine X(TheAMDGPUTarget); + RegisterTargetMachine Y(TheGCNTarget); +} + +static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { + return new ScheduleDAGMILive(C, make_unique()); +} + +static MachineSchedRegistry +SchedCustomRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static std::string computeDataLayout(const Triple &TT) { + std::string Ret = "e-p:32:32"; + + if (TT.getArch() == Triple::amdgcn) { + // 32-bit private, local, and region pointers. 64-bit global and constant. + Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; + } + + Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" + "-v512:512-v1024:1024-v2048:2048-n32:64"; + + return Ret; +} + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OptLevel) + : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, + OptLevel), + TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + IntrinsicInfo() { + setRequiresStructuredCFG(true); + initAsmInfo(); +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { + delete TLOF; +} + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, + StringRef FS, StringRef CPU, + TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) + : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + +//===----------------------------------------------------------------------===// +// AMDGPU Pass Setup +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM(); + } + + 
ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + return createR600MachineScheduler(C); + return nullptr; + } + + void addIRPasses() override; + void addCodeGenPrepare() override; + virtual bool addPreISel() override; + virtual bool addInstSelector() override; +}; + +class R600PassConfig : public AMDGPUPassConfig { +public: + R600PassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + + bool addPreISel() override; + void addPreRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +class GCNPassConfig : public AMDGPUPassConfig { +public: + GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) + : AMDGPUPassConfig(TM, PM) { } + bool addPreISel() override; + bool addInstSelector() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreSched2() override; + void addPreEmitPass() override; +}; + +} // End of anonymous namespace + +TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis( + [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); +} + +void AMDGPUPassConfig::addIRPasses() { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. 
+ addPass(createBarrierNoopPass()); + TargetPassConfig::addIRPasses(); +} + +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.isPromoteAllocaEnabled()) { + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + } + TargetPassConfig::addCodeGenPrepare(); +} + +bool +AMDGPUPassConfig::addPreISel() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createFlattenCFGPass()); + if (ST.IsIRStructurizerEnabled()) + addPass(createStructurizeCFGPass()); + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} + +//===----------------------------------------------------------------------===// +// R600 Pass Setup +//===----------------------------------------------------------------------===// + +bool R600PassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createR600TextureIntrinsicsReplacer()); + return false; +} + +void R600PassConfig::addPreRegAlloc() { + addPass(createR600VectorRegMerger(*TM)); +} + +void R600PassConfig::addPreSched2() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + addPass(createR600EmitClauseMarkers(), false); + if (ST.isIfCvtEnabled()) + addPass(&IfConverterID, false); + addPass(createR600ClauseMergePass(*TM), false); +} + +void R600PassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGStructurizerPass(), false); + addPass(createR600ExpandSpecialInstrsPass(*TM), false); + addPass(&FinalizeMachineBundlesID, false); + addPass(createR600Packetizer(*TM), false); + addPass(createR600ControlFlowFinalizer(*TM), false); +} + +TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { + return new R600PassConfig(this, PM); +} + +//===----------------------------------------------------------------------===// +// GCN Pass Setup +//===----------------------------------------------------------------------===// + +bool GCNPassConfig::addPreISel() { + AMDGPUPassConfig::addPreISel(); + addPass(createSinkingPass()); + addPass(createSITypeRewriter()); + addPass(createSIAnnotateControlFlowPass()); + return false; +} + +bool GCNPassConfig::addInstSelector() { + AMDGPUPassConfig::addInstSelector(); + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(createSIFoldOperandsPass()); + return false; +} + +void GCNPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + + // This needs to be run directly before register allocation because + // earlier passes might recompute live intervals. + // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass + if (getOptLevel() > CodeGenOpt::None) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + } + + if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + // Don't do this with no optimizations since it throws away debug info by + // merging nonadjacent loads. + + // This should be run after scheduling, but before register allocation. It + // also need extra copies to the address operand to be eliminated. 
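+    //
+    // For example, two adjacent ds_read_b32 loads whose offsets fit the
+    // combined encoding can typically be merged into a single ds_read2_b32
+    // by this pass once the extra address copies have been eliminated.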
+ initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); + } + addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addPostRegAlloc() { + addPass(createSIPrepareScratchRegs(), false); + addPass(createSIShrinkInstructionsPass(), false); +} + +void GCNPassConfig::addPreSched2() { + addPass(createSIInsertWaits(*TM), false); +} + +void GCNPassConfig::addPreEmitPass() { + addPass(createSILowerControlFlowPass(*TM), false); +} + +TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { + return new GCNPassConfig(this, PM); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h new file mode 100644 index 00000000000..14792e347a7 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -0,0 +1,89 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H + +#include "AMDGPUFrameLowering.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600ISelLowering.h" +#include "llvm/IR/DataLayout.h" + +namespace llvm { + +//===----------------------------------------------------------------------===// +// AMDGPU Target Machine (R600+) +//===----------------------------------------------------------------------===// + +class AMDGPUTargetMachine : public LLVMTargetMachine { +private: + +protected: + TargetLoweringObjectFile *TLOF; + AMDGPUSubtarget Subtarget; + AMDGPUIntrinsicInfo IntrinsicInfo; + +public: + AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + + const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { + return &Subtarget; + } + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } + TargetIRAnalysis getTargetIRAnalysis() override; + + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF; + } +}; + +//===----------------------------------------------------------------------===// +// R600 Target Machine (R600 -> Cayman) +//===----------------------------------------------------------------------===// + +class R600TargetMachine : public AMDGPUTargetMachine { + +public: + R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +//===----------------------------------------------------------------------===// +// GCN Target Machine (SI+) +//===----------------------------------------------------------------------===// + +class GCNTargetMachine : public AMDGPUTargetMachine { + +public: + 
GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, + StringRef CPU, TargetOptions Options, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL); + + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp new file mode 100644 index 00000000000..6dacc742b12 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -0,0 +1,82 @@ +//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "AMDGPUtti" + +void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Threshold = 300; // Twice the default. + UP.MaxCount = UINT_MAX; + UP.Partial = true; + + // TODO: Do we want runtime unrolling? + + for (const BasicBlock *BB : L->getBlocks()) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + for (const Instruction &I : *BB) { + const GetElementPtrInst *GEP = dyn_cast(&I); + if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) + continue; + + const Value *Ptr = GEP->getPointerOperand(); + const AllocaInst *Alloca = + dyn_cast(GetUnderlyingObject(Ptr, DL)); + if (Alloca) { + // We want to do whatever we can to limit the number of alloca + // instructions that make it through to the code generator. allocas + // require us to use indirect addressing, which is slow and prone to + // compiler bugs. If this loop does an address calculation on an + // alloca ptr, then we want to use a higher than normal loop unroll + // threshold. This will give SROA a better chance to eliminate these + // allocas. + // + // Don't use the maximum allowed value here as it will make some + // programs way too big. + UP.Threshold = 800; + } + } + } +} + +unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { + if (Vec) + return 0; + + // Number of VGPRs on SI. + if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 256; + + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } + +unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Semi-arbitrary large amount. 
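+  // The loop vectorizer only uses this as an upper bound on its interleave
+  // count, so returning a large value effectively leaves the decision to its
+  // own cost model.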
+ return 64; +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h new file mode 100644 index 00000000000..791c84e6f28 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -0,0 +1,78 @@ +//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// AMDGPU target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class AMDGPUTTIImpl : public BasicTTIImplBase { + typedef BasicTTIImplBase BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + const AMDGPUSubtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) + : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) + : BaseT(static_cast(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) + : BaseT(std::move(static_cast(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { + BaseT::operator=(static_cast(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + unsigned getRegisterBitWidth(bool Vector); + unsigned getMaxInterleaveFactor(unsigned VF); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp new file mode 100644 index 00000000000..c9b25a1a0b8 --- /dev/null +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -0,0 +1,1912 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//==-----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "structcfg" + +#define DEFAULT_VEC_SLOTS 8 + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +namespace llvm { + void initializeAMDGPUCFGStructurizerPass(PassRegistry&); +} + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. 
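+// (debug-print macros, the INVALIDSCCNUM sentinel and a small ReverseVector
+// helper that is only exercised by the STRESSTEST block ordering in run()).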
+// +//===----------------------------------------------------------------------===// +namespace { +#define SHOWNEWINSTR(i) \ + DEBUG(dbgs() << "New instr: " << *i << "\n"); + +#define SHOWNEWBLK(b, msg) \ +DEBUG( \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + dbgs() << "\n"; \ +); + +#define SHOWBLK_DETAIL(b, msg) \ +DEBUG( \ + if (b) { \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(dbgs()); \ + dbgs() << "\n"; \ + } \ +); + +#define INVALIDSCCNUM -1 + +template +void ReverseVector(SmallVectorImpl &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + + +namespace { + +class BlockInformation { +public: + bool IsRetired; + int SccNum; + BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace { +class AMDGPUCFGStructurizer : public MachineFunctionPass { +public: + typedef SmallVector MBBVector; + typedef std::map MBBInfoMap; + typedef std::map LoopLandInfoMap; + + enum PathToKind { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + }; + + static char ID; + + AMDGPUCFGStructurizer() : + MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { + initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AMDGPU Control Flow Graph structurizer Pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + /// Perform the CFG structurization + bool run(); + + /// Perform the CFG preparation + /// This step will remove every unconditionnal/dead jump instructions and make + /// sure all loops have an exit block + bool prepare(); + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = &TII->getRegisterInfo(); + DEBUG(MF.dump();); + OrderedBlks.clear(); + Visited.clear(); + FuncRep = &MF; + MLI = &getAnalysis(); + DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); + MDT = &getAnalysis(); + DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); + PDT = &getAnalysis(); + DEBUG(PDT->print(dbgs());); + prepare(); + run(); + DEBUG(MF.dump();); + return true; + } + +protected: + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; + MachineLoopInfo *MLI; + const R600InstrInfo *TII; + const AMDGPURegisterInfo *TRI; + + // PRINT FUNCTIONS + /// Print the ordered Blocks. 
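+  /// Each block is printed as BB<number>(<scc>,<size>), ten entries per
+  /// line, in the order produced by orderBlocks().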
+ void printOrderedBlocks() const { + size_t i = 0; + for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), + iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { + dbgs() << "BB" << (*iterBlk)->getNumber(); + dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; + if (i != 0 && i % 10 == 0) { + dbgs() << "\n"; + } else { + dbgs() << " "; + } + } + } + static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { + for (MachineLoop::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { + (*iter)->print(dbgs(), 0); + } + } + + // UTILITY FUNCTIONS + int getSCCNum(MachineBasicBlock *MBB) const; + MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; + bool hasBackEdge(MachineBasicBlock *MBB) const; + static unsigned getLoopDepth(MachineLoop *LoopRep); + bool isRetiredBlock(MachineBasicBlock *MBB) const; + bool isActiveLoophead(MachineBasicBlock *MBB) const; + PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry = true) const; + int countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const; + bool needMigrateBlock(MachineBasicBlock *MBB) const; + + // Utility Functions + void reversePredicateSetter(MachineBasicBlock::iterator I); + /// Compute the reversed DFS post order of Blocks + void orderBlocks(MachineFunction *MF); + + // Function originally from CFGStructTraits + void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, + DebugLoc DL = DebugLoc()); + MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); + void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, + DebugLoc DL); + void insertCondBranchBefore(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL); + void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); + static int getBranchNzeroOpcode(int OldOpcode); + static int getBranchZeroOpcode(int OldOpcode); + static int getContinueNzeroOpcode(int OldOpcode); + static int getContinueZeroOpcode(int OldOpcode); + static MachineBasicBlock *getTrueBranch(MachineInstr *MI); + static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); + static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI); + static bool isCondBranch(MachineInstr *MI); + static bool isUncondBranch(MachineInstr *MI); + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); + /// The correct naming for this is getPossibleLoopendBlockBranchInstr. + /// + /// BB with backward-edge could have move instructions after the branch + /// instruction. Such move instruction "belong to" the loop backward-edge. 
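+  /// The implementation therefore scans the block in reverse, skipping
+  /// trailing moves, and returns the first branch it finds (or null if a
+  /// non-move, non-branch instruction is reached first).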
+ MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); + static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); + static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); + static bool isReturnBlock(MachineBasicBlock *MBB); + static void cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) ; + static MachineBasicBlock *clone(MachineBasicBlock *MBB); + /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose + /// because the AMDGPU instruction is not recognized as terminator fix this + /// and retire this routine + void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, + MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); + static void wrapup(MachineBasicBlock *MBB); + + + int patternMatch(MachineBasicBlock *MBB); + int patternMatchGroup(MachineBasicBlock *MBB); + int serialPatternMatch(MachineBasicBlock *MBB); + int ifPatternMatch(MachineBasicBlock *MBB); + int loopendPatternMatch(); + int mergeLoop(MachineLoop *LoopRep); + int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); + + void handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop); + /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in + /// the same loop with LoopLandInfo without explicitly keeping track of + /// loopContBlks and loopBreakBlks, this is a method to get the information. + bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, + MachineBasicBlock *Src2MBB); + int handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); + int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr); + void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock *LandMBB, bool Detail = false); + int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); + void mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB); + + void mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); + void mergeLooplandBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *LandMBB); + void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB); + void settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB); + /// normalizeInfiniteLoopExit change + /// B1: + /// uncond_br LoopHeader + /// + /// to + /// B1: + /// cond_br 1 LoopHeader dummyExit + /// and return the newly added dummy exit block + MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); + void removeUnconditionalBranch(MachineBasicBlock *MBB); + /// Remove duplicate branches instructions in a block. 
+ /// For instance + /// B0: + /// cond_br X B1 B2 + /// cond_br X B1 B2 + /// is transformed to + /// B0: + /// cond_br X B1 B2 + void removeRedundantConditionalBranch(MachineBasicBlock *MBB); + void addDummyExitBlock(SmallVectorImpl &RetMBB); + void removeSuccessor(MachineBasicBlock *MBB); + MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB); + void migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); + void recordSccnum(MachineBasicBlock *MBB, int SCCNum); + void retireBlock(MachineBasicBlock *MBB); + void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); + + MachineBasicBlock *findNearestCommonPostDom(std::set&); + /// This is work around solution for findNearestCommonDominator not available + /// to post dom a proper fix should go to Dominators.h. + MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2); + +private: + MBBInfoMap BlockInfoMap; + LoopLandInfoMap LLInfoMap; + std::map Visited; + MachineFunction *FuncRep; + SmallVector OrderedBlks; +}; + +int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return INVALIDSCCNUM; + return (*It).second->SccNum; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) + const { + LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); + if (It == LLInfoMap.end()) + return nullptr; + return (*It).second; +} + +bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + if (!LoopRep) + return false; + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + return MBB->isSuccessor(LoopHeader); +} + +unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { + return LoopRep ? 
LoopRep->getLoopDepth() : 0; +} + +bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { + MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); + if (It == BlockInfoMap.end()) + return false; + return (*It).second->IsRetired; +} + +bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { + MachineLoop *LoopRep = MLI->getLoopFor(MBB); + while (LoopRep && LoopRep->getHeader() == MBB) { + MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); + if(!LoopLand) + return true; + if (!isRetiredBlock(LoopLand)) + return true; + LoopRep = LoopRep->getParentLoop(); + } + return false; +} +AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, + bool AllowSideEntry) const { + assert(DstMBB); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + while (SrcMBB && SrcMBB->succ_size() == 1) { + SrcMBB = *SrcMBB->succ_begin(); + if (SrcMBB == DstMBB) + return SinglePath_InPath; + if (!AllowSideEntry && SrcMBB->pred_size() > 1) + return Not_SinglePath; + } + if (SrcMBB && SrcMBB->succ_size()==0) + return SinglePath_NotInPath; + return Not_SinglePath; +} + +int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, + MBBVector::const_iterator E) const { + int Count = 0; + while (It != E) { + if (!isRetiredBlock(*It)) + ++Count; + ++It; + } + return Count; +} + +bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { + unsigned BlockSizeThreshold = 30; + unsigned CloneInstrThreshold = 100; + bool MultiplePreds = MBB && (MBB->pred_size() > 1); + + if(!MultiplePreds) + return false; + unsigned BlkSize = MBB->size(); + return ((BlkSize > BlockSizeThreshold) && + (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); +} + +void AMDGPUCFGStructurizer::reversePredicateSetter( + MachineBasicBlock::iterator I) { + while (I--) { + if (I->getOpcode() == AMDGPU::PRED_X) { + switch (static_cast(I)->getOperand(2).getImm()) { + case OPCODE_IS_ZERO_INT: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO_INT); + return; + case OPCODE_IS_NOT_ZERO_INT: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO_INT); + return; + case OPCODE_IS_ZERO: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_NOT_ZERO); + return; + case OPCODE_IS_NOT_ZERO: + static_cast(I)->getOperand(2) + .setImm(OPCODE_IS_ZERO); + return; + default: + llvm_unreachable("PRED_X Opcode invalid!"); + } + } + } +} + +void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = MBB->getParent() + ->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->push_back(MI); + //assume the instruction doesn't take any reg operand ... + SHOWNEWINSTR(MI); +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, + int NewOpcode, DebugLoc DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); + if (MBB->begin() != MBB->end()) + MBB->insert(MBB->begin(), MI); + else + MBB->push_back(MI); + SHOWNEWINSTR(MI); + return MI; +} + +MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( + MachineBasicBlock::iterator I, int NewOpcode) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineInstr *NewMBB = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->insert(I, NewMBB); + //assume the instruction doesn't take any reg operand ... 
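+  // Note that, despite its name, NewMBB here is the newly created
+  // MachineInstr, not a basic block.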
+ SHOWNEWINSTR(NewMBB); + return NewMBB; +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { + MachineInstr *OldMI = &(*I); + MachineBasicBlock *MBB = OldMI->getParent(); + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + MBB->insert(I, NewMI); + MachineInstrBuilder MIB(*MF, NewMI); + MIB.addReg(OldMI->getOperand(1).getReg(), false); + SHOWNEWINSTR(NewMI); + //erase later oldInstr->eraseFromParent(); +} + +void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator I, int NewOpcode, int RegNum, + DebugLoc DL) { + MachineFunction *MF = blk->getParent(); + MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); + //insert before + blk->insert(I, NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, + int NewOpcode, int RegNum) { + MachineFunction *MF = MBB->getParent(); + MachineInstr *NewInstr = + MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); + MBB->push_back(NewInstr); + MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); + SHOWNEWINSTR(NewInstr); +} + +int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: llvm_unreachable("internal error"); + }; + return -1; +} + +int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { + switch(OldOpcode) { + case AMDGPU::JUMP_COND: + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + default: llvm_unreachable("internal error"); + } + return -1; +} + +MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { + return MI->getOperand(0).getMBB(); +} + +void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, + MachineBasicBlock *MBB) { + MI->getOperand(0).setMBB(MBB); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, + MachineInstr *MI) { + assert(MBB->succ_size() == 2); + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + MachineBasicBlock::succ_iterator It = MBB->succ_begin(); + MachineBasicBlock::succ_iterator Next = It; + ++Next; + return (*It == TrueBranch) ? 
*Next : *It; +} + +bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP_COND: + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return true; + default: + return false; + } + return false; +} + +bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::JUMP: + case AMDGPU::BRANCH: + return true; + default: + return false; + } + return false; +} + +DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); + ++It) { + MachineInstr *instr = &(*It); + if (instr->getDebugLoc()) + DL = instr->getDebugLoc(); + } + return DL; +} + +MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( + MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + MachineInstr *MI = &*It; + if (MI && (isCondBranch(MI) || isUncondBranch(MI))) + return MI; + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( + MachineBasicBlock *MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); + It != E; ++It) { + // FIXME: Simplify + MachineInstr *MI = &*It; + if (MI) { + if (isCondBranch(MI) || isUncondBranch(MI)) + return MI; + else if (!TII->isMov(MI->getOpcode())) + break; + } + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *instr = &(*It); + if (instr->getOpcode() == AMDGPU::RETURN) + return instr; + } + return nullptr; +} + +MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { + MachineBasicBlock::reverse_iterator It = MBB->rbegin(); + if (It != MBB->rend()) { + MachineInstr *MI = &(*It); + if (MI->getOpcode() == AMDGPU::CONTINUE) + return MI; + } + return nullptr; +} + +bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { + MachineInstr *MI = getReturnInstr(MBB); + bool IsReturn = (MBB->succ_size() == 0); + if (MI) + assert(IsReturn); + else if (IsReturn) + DEBUG( + dbgs() << "BB" << MBB->getNumber() + <<" is return block without RETURN instr\n";); + return IsReturn; +} + +void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), + iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) + DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of +} + +MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { + MachineFunction *Func = MBB->getParent(); + MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); + Func->push_back(NewMBB); //insert to function + for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); + It != E; ++It) { + MachineInstr *MI = Func->CloneMachineInstr(It); + NewMBB->push_back(MI); + } + return NewMBB; +} + +void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( + MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, + MachineBasicBlock *NewBlk) { + MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); + if (BranchMI && isCondBranch(BranchMI) && + getTrueBranch(BranchMI) == OldMBB) + setTrueBranch(BranchMI, NewBlk); +} + +void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { + assert((!MBB->getParent()->getJumpTableInfo() + || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector ContInstr; + MachineBasicBlock::iterator Pre = MBB->begin(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator It = Pre; + while (It != E) { + if (Pre->getOpcode() == AMDGPU::CONTINUE + && It->getOpcode() == AMDGPU::ENDLOOP) + ContInstr.push_back(Pre); + Pre = It; + ++It; + } + + //delete continue right before endloop + for (unsigned i = 0; i < ContInstr.size(); ++i) + ContInstr[i]->eraseFromParent(); + + // TODO to fix up jump table so later phase won't be confused. if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + +} + + +bool AMDGPUCFGStructurizer::prepare() { + bool Changed = false; + + //FIXME: if not reducible flow graph, make it so ??? + + DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + + orderBlocks(FuncRep); + + SmallVector RetBlks; + + // Add an ExitBlk to loop that don't have one + for (MachineLoopInfo::iterator It = MLI->begin(), + E = MLI->end(); It != E; ++It) { + MachineLoop *LoopRep = (*It); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + + if (ExitingMBBs.size() == 0) { + MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); + if (DummyExitBlk) + RetBlks.push_back(DummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + for (SmallVectorImpl::const_iterator + It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + removeUnconditionalBranch(MBB); + removeRedundantConditionalBranch(MBB); + if (isReturnBlock(MBB)) { + RetBlks.push_back(MBB); + } + assert(MBB->succ_size() <= 2); + } + + if (RetBlks.size() >= 2) { + addDummyExitBlock(RetBlks); + Changed = true; + } + + return Changed; +} + +bool AMDGPUCFGStructurizer::run() { + + //Assume reducible CFG... + DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); + int NumIter = 0; + bool Finish = false; + MachineBasicBlock *MBB; + bool MakeProgress = false; + int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), + OrderedBlks.end()); + + do { + ++NumIter; + DEBUG( + dbgs() << "numIter = " << NumIter + << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; + ); + + SmallVectorImpl::const_iterator It = + OrderedBlks.begin(); + SmallVectorImpl::const_iterator E = + OrderedBlks.end(); + + SmallVectorImpl::const_iterator SccBeginIter = + It; + MachineBasicBlock *SccBeginMBB = nullptr; + int SccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int SccNumIter; // Number of iteration in this SCC. + + while (It != E) { + MBB = *It; + + if (!SccBeginMBB) { + SccBeginIter = It; + SccBeginMBB = MBB; + SccNumIter = 0; + SccNumBlk = NumRemainedBlk; // Init to maximum possible number. + DEBUG( + dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); + dbgs() << "\n"; + ); + } + + if (!isRetiredBlock(MBB)) + patternMatch(MBB); + + ++It; + + bool ContNextScc = true; + if (It == E + || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { + // Just finish one scc. 
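+        // Compare the number of still-active blocks in this SCC against the
+        // previous sweep: if it stopped shrinking we give up on the SCC, and
+        // if it shrank we rewind to its first block and sweep it again.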
+ ++SccNumIter; + int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { + DEBUG( + dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n"; + ); + ContNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { + SccNumBlk = sccRemainedNumBlk; + It = SccBeginIter; + ContNextScc = false; + DEBUG( + dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n'; + ); + } else { + // Finish the current scc. + ContNextScc = true; + } + } else { + // Continue on next component in the current scc. + ContNextScc = false; + } + + if (ContNextScc) + SccBeginMBB = nullptr; + } //while, "one iteration" over the function. + + MachineBasicBlock *EntryMBB = + GraphTraits::nodes_begin(FuncRep); + if (EntryMBB->succ_size() == 0) { + Finish = true; + DEBUG( + dbgs() << "Reduce to one block\n"; + ); + } else { + int NewnumRemainedBlk + = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); + // consider cloned blocks ?? + if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { + MakeProgress = true; + NumRemainedBlk = NewnumRemainedBlk; + } else { + MakeProgress = false; + DEBUG( + dbgs() << "No progress\n"; + ); + } + } + } while (!Finish && MakeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + wrapup(GraphTraits::nodes_begin(FuncRep)); + + // Detach retired Block, release memory. + for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); + It != E; ++It) { + if ((*It).second && (*It).second->IsRetired) { + assert(((*It).first)->getNumber() != -1); + DEBUG( + dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; + ); + (*It).first->eraseFromParent(); //Remove from the parent Function. 
+ } + delete (*It).second; + } + BlockInfoMap.clear(); + LLInfoMap.clear(); + + if (!Finish) { + DEBUG(FuncRep->viewCFG()); + llvm_unreachable("IRREDUCIBLE_CFG"); + } + + return true; +} + + + +void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { + int SccNum = 0; + MachineBasicBlock *MBB; + for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); + ++It, ++SccNum) { + const std::vector &SccNext = *It; + for (std::vector::const_iterator + blockIter = SccNext.begin(), blockEnd = SccNext.end(); + blockIter != blockEnd; ++blockIter) { + MBB = *blockIter; + OrderedBlks.push_back(MBB); + recordSccnum(MBB, SccNum); + } + } + + //walk through all the block in func to check for unreachable + typedef GraphTraits GTM; + MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); + for (; It != E; ++It) { + MachineBasicBlock *MBB = &(*It); + SccNum = getSCCNum(MBB); + if (SccNum == INVALIDSCCNUM) + dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; + } +} + +int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { + int NumMatch = 0; + int CurMatch; + + DEBUG( + dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; + ); + + while ((CurMatch = patternMatchGroup(MBB)) > 0) + NumMatch += CurMatch; + + DEBUG( + dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n"; + ); + + return NumMatch; +} + +int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { + int NumMatch = 0; + NumMatch += loopendPatternMatch(); + NumMatch += serialPatternMatch(MBB); + NumMatch += ifPatternMatch(MBB); + return NumMatch; +} + + +int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { + if (MBB->succ_size() != 1) + return 0; + + MachineBasicBlock *childBlk = *MBB->succ_begin(); + if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) + return 0; + + mergeSerialBlock(MBB, childBlk); + ++numSerialPatternMatch; + return 1; +} + +int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { + //two edges + if (MBB->succ_size() != 2) + return 0; + if (hasBackEdge(MBB)) + return 0; + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + if (!BranchMI) + return 0; + + assert(isCondBranch(BranchMI)); + int NumMatch = 0; + + MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); + NumMatch += serialPatternMatch(TrueMBB); + NumMatch += ifPatternMatch(TrueMBB); + MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); + NumMatch += serialPatternMatch(FalseMBB); + NumMatch += ifPatternMatch(FalseMBB); + MachineBasicBlock *LandBlk; + int Cloned = 0; + + assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); + // TODO: Simplify + if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 + && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { + // Diamond pattern + LandBlk = *TrueMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { + // Triangle pattern, false is empty + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && *FalseMBB->succ_begin() == TrueMBB) { + // Triangle pattern, true is empty + // We reverse the predicate to make a triangle, empty false pattern; + std::swap(TrueMBB, FalseMBB); + reversePredicateSetter(MBB->end()); + LandBlk = FalseMBB; + FalseMBB = nullptr; + } else if (FalseMBB->succ_size() == 1 + && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { + LandBlk = *FalseMBB->succ_begin(); + } else if (TrueMBB->succ_size() == 1 + && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { + LandBlk = 
*TrueMBB->succ_begin(); + } else { + return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); + } + + // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the + // new BB created for landBlk==NULL may introduce new challenge to the + // reduction process. + if (LandBlk && + ((TrueMBB && TrueMBB->pred_size() > 1) + || (FalseMBB && FalseMBB->pred_size() > 1))) { + Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); + } + + if (TrueMBB && TrueMBB->pred_size() > 1) { + TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); + ++Cloned; + } + + if (FalseMBB && FalseMBB->pred_size() > 1) { + FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); + ++Cloned; + } + + mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); + + ++numIfPatternMatch; + + numClonedBlock += Cloned; + + return 1 + Cloned + NumMatch; +} + +int AMDGPUCFGStructurizer::loopendPatternMatch() { + std::deque NestedLoops; + for (auto &It: *MLI) + for (MachineLoop *ML : depth_first(It)) + NestedLoops.push_front(ML); + + if (NestedLoops.size() == 0) + return 0; + + // Process nested loop outside->inside (we did push_front), + // so "continue" to a outside loop won't be mistaken as "break" + // of the current loop. + int Num = 0; + for (MachineLoop *ExaminedLoop : NestedLoops) { + if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) + continue; + DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); + int NumBreak = mergeLoop(ExaminedLoop); + if (NumBreak == -1) + break; + Num += NumBreak; + } + return Num; +} + +int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MBBVector ExitingMBBs; + LoopRep->getExitingBlocks(ExitingMBBs); + assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); + DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); + // We assume a single ExitBlk + MBBVector ExitBlks; + LoopRep->getExitBlocks(ExitBlks); + SmallPtrSet ExitBlkSet; + for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) + ExitBlkSet.insert(ExitBlks[i]); + assert(ExitBlkSet.size() == 1); + MachineBasicBlock *ExitBlk = *ExitBlks.begin(); + assert(ExitBlk && "Loop has several exit block"); + MBBVector LatchBlks; + typedef GraphTraits > InvMBBTraits; + InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), + PE = InvMBBTraits::child_end(LoopHeader); + for (; PI != PE; PI++) { + if (LoopRep->contains(*PI)) + LatchBlks.push_back(*PI); + } + + for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) + mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); + for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) + settleLoopcontBlock(LatchBlks[i], LoopHeader); + int Match = 0; + do { + Match = 0; + Match += serialPatternMatch(LoopHeader); + Match += ifPatternMatch(LoopHeader); + } while (Match > 0); + mergeLooplandBlock(LoopHeader, ExitBlk); + MachineLoop *ParentLoop = LoopRep->getParentLoop(); + if (ParentLoop) + MLI->changeLoopFor(LoopHeader, ParentLoop); + else + MLI->removeBlock(LoopHeader); + Visited[LoopRep] = true; + return 1; +} + +int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, + MachineBasicBlock *LoopHeader) { + int NumCont = 0; + SmallVector ContMBB; + typedef GraphTraits > GTIM; + GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), + E = GTIM::child_end(LoopHeader); + for (; It != E; ++It) { + MachineBasicBlock *MBB = *It; + if (LoopRep->contains(MBB)) { + handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), + LoopHeader, LoopRep); + 
ContMBB.push_back(MBB); + ++NumCont; + } + } + + for (SmallVectorImpl::iterator It = ContMBB.begin(), + E = ContMBB.end(); It != E; ++It) { + (*It)->removeSuccessor(LoopHeader); + } + + numLoopcontPatternMatch += NumCont; + + return NumCont; +} + + +bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( + MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { + if (Src1MBB->succ_size() == 0) { + MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); + if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { + MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; + if (TheEntry) { + DEBUG( + dbgs() << "isLoopContBreakBlock yes src1 = BB" + << Src1MBB->getNumber() + << " src2 = BB" << Src2MBB->getNumber() << "\n"; + ); + return true; + } + } + } + return false; +} + +int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); + if (Num == 0) { + DEBUG( + dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + ); + Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); + } + return Num; +} + +int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { + int Num = 0; + MachineBasicBlock *DownBlk; + + //trueBlk could be the common post dominator + DownBlk = TrueMBB; + + DEBUG( + dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() + << " false = BB" << FalseMBB->getNumber() << "\n"; + ); + + while (DownBlk) { + DEBUG( + dbgs() << "check down = BB" << DownBlk->getNumber(); + ); + + if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { + DEBUG( + dbgs() << " working\n"; + ); + + Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); + Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); + + numClonedBlock += Num; + Num += serialPatternMatch(*HeadMBB->succ_begin()); + Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); + Num += ifPatternMatch(HeadMBB); + assert(Num > 0); + + break; + } + DEBUG( + dbgs() << " not working\n"; + ); + DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; + } // walk down the postDomTree + + return Num; +} + +void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( + MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { + dbgs() << "head = BB" << HeadMBB->getNumber() + << " size = " << HeadMBB->size(); + if (Detail) { + dbgs() << "\n"; + HeadMBB->print(dbgs()); + dbgs() << "\n"; + } + + if (TrueMBB) { + dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " + << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + TrueMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (FalseMBB) { + dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " + << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + FalseMBB->print(dbgs()); + dbgs() << "\n"; + } + } + if (LandMBB) { + dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " + << LandMBB->size() << " numPred = " << LandMBB->pred_size(); + if (Detail) { + dbgs() << "\n"; + LandMBB->print(dbgs()); + dbgs() << "\n"; + } + } + + dbgs() << "\n"; +} + +int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock **LandMBBPtr) { + bool MigrateTrue = false; + bool MigrateFalse = false; + + MachineBasicBlock *LandBlk = *LandMBBPtr; + + assert((!TrueMBB || TrueMBB->succ_size() <= 1) + && (!FalseMBB || FalseMBB->succ_size() <= 1)); + + if (TrueMBB == FalseMBB) + return 0; + + MigrateTrue = needMigrateBlock(TrueMBB); + MigrateFalse = needMigrateBlock(FalseMBB); + + if (!MigrateTrue && !MigrateFalse) + return 0; + + // If we need to migrate either trueBlk and falseBlk, migrate the rest that + // have more than one predecessors. without doing this, its predecessor + // rather than headBlk will have undefined value in initReg. + if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) + MigrateTrue = true; + if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) + MigrateFalse = true; + + DEBUG( + dbgs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk + // + // new: headBlk => if () {initReg = 1; org trueBlk branch} else + // {initReg = 0; org falseBlk branch } + // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} + // => org landBlk + // if landBlk->pred_size() > 2, put the about if-else inside + // if (initReg !=2) {...} + // + // add initReg = initVal to headBlk + + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (!MigrateTrue || !MigrateFalse) { + // XXX: We have an opportunity here to optimize the "branch into if" case + // here. Branch into if looks like this: + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true + // \ / + // done + // + // The diamond_head block begins the "if" and the diamond_true block + // is the block being "branched into". + // + // If MigrateTrue is true, then TrueBB is the block being "branched into" + // and if MigrateFalse is true, then FalseBB is the block being + // "branched into" + // + // Here is the pseudo code for how I think the optimization should work: + // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. + // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. + // 3. 
Move the branch instruction from diamond_head into its own basic + // block (new_block). + // 4. Add an unconditional branch from diamond_head to new_block + // 5. Replace the branch instruction in branch_from with an unconditional + // branch to new_block. If branch_from has multiple predecessors, then + // we need to replace the True/False block in the branch + // instruction instead of replacing it. + // 6. Change the condition of the branch instruction in new_block from + // COND to (COND || GPR0) + // + // In order insert these MOV instruction, we will need to use the + // RegisterScavenger. Usually liveness stops being tracked during + // the late machine optimization passes, however if we implement + // bool TargetRegisterInfo::requiresRegisterScavenging( + // const MachineFunction &MF) + // and have it return true, liveness will be tracked correctly + // by generic optimization passes. We will also need to make sure that + // all of our target-specific passes that run after regalloc and before + // the CFGStructurizer track liveness and we will need to modify this pass + // to correctly track liveness. + // + // After the above changes, the new CFG should look like this: + // entry + // / | + // diamond_head branch_from + // \ / + // new_block + // / | + // diamond_false diamond_true + // \ / + // done + // + // Without this optimization, we are forced to duplicate the diamond_true + // block and we will end up with a CFG like this: + // + // entry + // / | + // diamond_head branch_from + // / \ | + // diamond_false diamond_true diamond_true (duplicate) + // \ / | + // done --------------------| + // + // Duplicating diamond_true can be very costly especially if it has a + // lot of instructions. + return 0; + } + + int NumNewBlk = 0; + + bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); + + //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + + if (LandBlkHasOtherPred) { + llvm_unreachable("Extra register needed to handle CFG"); + unsigned CmpResReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra compare instruction needed to handle CFG"); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + CmpResReg, DebugLoc()); + } + + // XXX: We are running this after RA, so creating virtual registers will + // cause an assertion failure in the PostRA scheduling pass. + unsigned InitReg = + HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); + insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + DebugLoc()); + + if (MigrateTrue) { + migrateInstruction(TrueMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 1). 
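  // A minimal sketch of the liveness hook referred to in the "branch into if"
  // comment above: keeping liveness tracked after register allocation is what
  // would let this pass insert the MOV instructions via the RegisterScavenger.
  // In LLVM a target opts in by overriding
  // TargetRegisterInfo::requiresRegisterScavenging; the AMDGPURegisterInfo
  // spelling below is illustrative only and is not part of this patch.
  //
  //   bool AMDGPURegisterInfo::requiresRegisterScavenging(
  //       const MachineFunction &MF) const {
  //     // Returning true keeps liveness valid through post-RA target passes,
  //     // which the optimization sketched above depends on.
  //     return true;
  //   }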
+ llvm_unreachable("Extra register needed to handle CFG"); + } + insertInstrBefore(I, AMDGPU::ELSE); + + if (MigrateFalse) { + migrateInstruction(FalseMBB, LandBlk, I); + // need to uncondionally insert the assignment to ensure a path from its + // predecessor rather than headBlk has valid value in initReg if + // (initVal != 0) + llvm_unreachable("Extra register needed to handle CFG"); + } + + if (LandBlkHasOtherPred) { + // add endif + insertInstrBefore(I, AMDGPU::ENDIF); + + // put initReg = 2 to other predecessors of landBlk + for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), + PE = LandBlk->pred_end(); PI != PE; ++PI) { + MachineBasicBlock *MBB = *PI; + if (MBB != TrueMBB && MBB != FalseMBB) + llvm_unreachable("Extra register needed to handle CFG"); + } + } + DEBUG( + dbgs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); + ); + + // update landBlk + *LandMBBPtr = LandBlk; + + return NumNewBlk; +} + +void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, + MachineLoop *ContLoop) { + DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() + << " header = BB" << ContMBB->getNumber() << "\n"; + dbgs() << "Trying to continue loop-depth = " + << getLoopDepth(ContLoop) + << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); + settleLoopcontBlock(ContingMBB, ContMBB); +} + +void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, + MachineBasicBlock *SrcMBB) { + DEBUG( + dbgs() << "serialPattern BB" << DstMBB->getNumber() + << " <= BB" << SrcMBB->getNumber() << "\n"; + ); + DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); + + DstMBB->removeSuccessor(SrcMBB); + cloneSuccessorList(DstMBB, SrcMBB); + + removeSuccessor(SrcMBB); + MLI->removeBlock(SrcMBB); + retireBlock(SrcMBB); +} + +void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, + MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, + MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { + assert (TrueMBB); + DEBUG( + dbgs() << "ifPattern BB" << MBB->getNumber(); + dbgs() << "{ "; + if (TrueMBB) { + dbgs() << "BB" << TrueMBB->getNumber(); + } + dbgs() << " } else "; + dbgs() << "{ "; + if (FalseMBB) { + dbgs() << "BB" << FalseMBB->getNumber(); + } + dbgs() << " }\n "; + dbgs() << "landBlock: "; + if (!LandMBB) { + dbgs() << "NULL"; + } else { + dbgs() << "BB" << LandMBB->getNumber(); + } + dbgs() << "\n"; + ); + + int OldOpcode = BranchMI->getOpcode(); + DebugLoc BranchDL = BranchMI->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + MachineBasicBlock::iterator I = BranchMI; + insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), + BranchDL); + + if (TrueMBB) { + MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); + MBB->removeSuccessor(TrueMBB); + if (LandMBB && TrueMBB->succ_size()!=0) + TrueMBB->removeSuccessor(LandMBB); + retireBlock(TrueMBB); + MLI->removeBlock(TrueMBB); + } + + if (FalseMBB) { + insertInstrBefore(I, AMDGPU::ELSE); + MBB->splice(I, FalseMBB, FalseMBB->begin(), + FalseMBB->end()); + MBB->removeSuccessor(FalseMBB); + if (LandMBB && FalseMBB->succ_size() != 0) + FalseMBB->removeSuccessor(LandMBB); + retireBlock(FalseMBB); + MLI->removeBlock(FalseMBB); + } + insertInstrBefore(I, AMDGPU::ENDIF); + + BranchMI->eraseFromParent(); + + if (LandMBB && TrueMBB && FalseMBB) + MBB->addSuccessor(LandMBB); + +} + +void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + + insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + DstBlk->addSuccessor(LandMBB); + DstBlk->removeSuccessor(DstBlk); +} + + +void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, + MachineBasicBlock *LandMBB) { + DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); + MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); + assert(BranchMI && isCondBranch(BranchMI)); + DebugLoc DL = BranchMI->getDebugLoc(); + MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); + MachineBasicBlock::iterator I = BranchMI; + if (TrueBranch != LandMBB) + reversePredicateSetter(I); + insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); + insertInstrBefore(I, AMDGPU::BREAK); + insertInstrBefore(I, AMDGPU::ENDIF); + //now branchInst can be erase safely + BranchMI->eraseFromParent(); + //now take care of successors, retire blocks + ExitingMBB->removeSuccessor(LandMBB); +} + +void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, + MachineBasicBlock *ContMBB) { + DEBUG(dbgs() << "settleLoopcontBlock conting = BB" + << ContingMBB->getNumber() + << ", cont = BB" << ContMBB->getNumber() << "\n";); + + MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); + if (MI) { + assert(isCondBranch(MI)); + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *TrueBranch = getTrueBranch(MI); + int OldOpcode = MI->getOpcode(); + DebugLoc DL = MI->getDebugLoc(); + + bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); + + if (!UseContinueLogical) { + int BranchOpcode = + TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : + getBranchZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + // insertEnd to ensure phi-moves, if exist, go before the continue-instr. + insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); + insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + } else { + int BranchOpcode = + TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : + getContinueZeroOpcode(OldOpcode); + insertCondBranchBefore(I, BranchOpcode, DL); + } + + MI->eraseFromParent(); + } else { + // if we've arrived here then we've already erased the branch instruction + // travel back up the basic block to see the last reference of our debug + // location we've just inserted that reference here so it should be + // representative insertEnd to ensure phi-moves, if exist, go before the + // continue-instr. 
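  // Illustration (not part of this patch): taken together, mergeLooplandBlock,
  // mergeLoopbreakBlock and settleLoopcontBlock above rewrite a natural loop
  // with a conditional exit and a latch into roughly the structured pseudo-op
  // form below, with the surviving blocks folded into the loop header:
  //
  //   WHILELOOP
  //     ...
  //     IF_PREDICATE_SET PREDICATE_BIT   // the (possibly reversed) exit test
  //       BREAK
  //     ENDIF
  //     ...
  //     CONTINUE                         // emitted for latches that need it
  //   ENDLOOP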
+ insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + getLastDebugLocInBB(ContingMBB)); + } +} + +int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, + MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { + int Cloned = 0; + assert(PreMBB->isSuccessor(SrcMBB)); + while (SrcMBB && SrcMBB != DstMBB) { + assert(SrcMBB->succ_size() == 1); + if (SrcMBB->pred_size() > 1) { + SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); + ++Cloned; + } + + PreMBB = SrcMBB; + SrcMBB = *SrcMBB->succ_begin(); + } + + return Cloned; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, + MachineBasicBlock *PredMBB) { + assert(PredMBB->isSuccessor(MBB) && + "succBlk is not a prececessor of curBlk"); + + MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions + replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); + //srcBlk, oldBlk, newBlk + + PredMBB->removeSuccessor(MBB); + PredMBB->addSuccessor(CloneMBB); + + // add all successor to cloneBlk + cloneSuccessorList(CloneMBB, MBB); + + numClonedInstr += MBB->size(); + + DEBUG( + dbgs() << "Cloned block: " << "BB" + << MBB->getNumber() << "size " << MBB->size() << "\n"; + ); + + SHOWNEWBLK(CloneMBB, "result of Cloned block: "); + + return CloneMBB; +} + +void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, + MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator SpliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); + if (!BranchMI) { + DEBUG( + dbgs() << "migrateInstruction don't see branch instr\n" ; + ); + SpliceEnd = SrcMBB->end(); + } else { + DEBUG( + dbgs() << "migrateInstruction see branch instr\n" ; + BranchMI->dump(); + ); + SpliceEnd = BranchMI; + } + DEBUG( + dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); + + //splice insert before insertPos + DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); + + DEBUG( + dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() + << "srcSize = " << SrcMBB->size() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { + MachineBasicBlock *LoopHeader = LoopRep->getHeader(); + MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + + if (!LoopHeader || !LoopLatch) + return nullptr; + MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); + // Is LoopRep an infinite loop ? 
+ if (!BranchMI || !isUncondBranch(BranchMI)) + return nullptr; + + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); + MachineBasicBlock::iterator I = BranchMI; + unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); + llvm_unreachable("Extra register needed to handle CFG"); + MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); + MachineInstrBuilder MIB(*FuncRep, NewMI); + MIB.addMBB(LoopHeader); + MIB.addReg(ImmReg, false); + SHOWNEWINSTR(NewMI); + BranchMI->eraseFromParent(); + LoopLatch->addSuccessor(DummyExitBlk); + + return DummyExitBlk; +} + +void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { + MachineInstr *BranchMI; + + // I saw two unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. + while ((BranchMI = getLoopendBlockBranchInstr(MBB)) + && isUncondBranch(BranchMI)) { + DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + BranchMI->eraseFromParent(); + } +} + +void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( + MachineBasicBlock *MBB) { + if (MBB->succ_size() != 2) + return; + MachineBasicBlock *MBB1 = *MBB->succ_begin(); + MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); + if (MBB1 != MBB2) + return; + + MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); + assert(BranchMI && isCondBranch(BranchMI)); + DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + BranchMI->eraseFromParent(); + SHOWNEWBLK(MBB1, "Removing redundant successor"); + MBB->removeSuccessor(MBB1); +} + +void AMDGPUCFGStructurizer::addDummyExitBlock( + SmallVectorImpl &RetMBB) { + MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(DummyExitBlk); //insert to function + insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + + for (SmallVectorImpl::iterator It = RetMBB.begin(), + E = RetMBB.end(); It != E; ++It) { + MachineBasicBlock *MBB = *It; + MachineInstr *MI = getReturnInstr(MBB); + if (MI) + MI->eraseFromParent(); + MBB->addSuccessor(DummyExitBlk); + DEBUG( + dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n"; + ); + } + SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); +} + +void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { + while (MBB->succ_size()) + MBB->removeSuccessor(*MBB->succ_begin()); +} + +void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, + int SccNum) { + BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; + if (!srcBlkInfo) + srcBlkInfo = new BlockInformation(); + srcBlkInfo->SccNum = SccNum; +} + +void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { + DEBUG( + dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; + ); + + BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; + + if (!SrcBlkInfo) + SrcBlkInfo = new BlockInformation(); + + SrcBlkInfo->IsRetired = true; + assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 + && "can't retire block yet"); +} + +void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, + MachineBasicBlock *MBB) { + MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; + if (!MBB) { + MBB = FuncRep->CreateMachineBasicBlock(); + FuncRep->push_back(MBB); //insert to function + SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); + } + TheEntry 
= MBB; + DEBUG( + dbgs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << MBB->getNumber() << "\n"; + ); +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, + MachineBasicBlock *MBB2) { + + if (PDT->dominates(MBB1, MBB2)) + return MBB1; + if (PDT->dominates(MBB2, MBB1)) + return MBB2; + + MachineDomTreeNode *Node1 = PDT->getNode(MBB1); + MachineDomTreeNode *Node2 = PDT->getNode(MBB2); + + // Handle newly cloned node. + if (!Node1 && MBB1->succ_size() == 1) + return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); + if (!Node2 && MBB2->succ_size() == 1) + return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); + + if (!Node1 || !Node2) + return nullptr; + + Node1 = Node1->getIDom(); + while (Node1) { + if (PDT->dominates(Node1, Node2)) + return Node1->getBlock(); + Node1 = Node1->getIDom(); + } + + return nullptr; +} + +MachineBasicBlock * +AMDGPUCFGStructurizer::findNearestCommonPostDom( + std::set &MBBs) { + MachineBasicBlock *CommonDom; + std::set::const_iterator It = MBBs.begin(); + std::set::const_iterator E = MBBs.end(); + for (CommonDom = *It; It != E && CommonDom; ++It) { + MachineBasicBlock *MBB = *It; + if (MBB != CommonDom) + CommonDom = findNearestCommonPostDom(MBB, CommonDom); + } + + DEBUG( + dbgs() << "Common post dominator for exit blocks is "; + if (CommonDom) + dbgs() << "BB" << CommonDom->getNumber() << "\n"; + else + dbgs() << "NULL\n"; + ); + + return CommonDom; +} + +char AMDGPUCFGStructurizer::ID = 0; + +} // end anonymous namespace + + +INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", + "AMDGPU CFG Structurizer", false, false) + +FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { + return new AMDGPUCFGStructurizer(); +} diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h new file mode 100644 index 00000000000..4d3041ff3db --- /dev/null +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -0,0 +1,704 @@ +//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeT.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODET_H +#define AMDKERNELCODET_H + +#include +#include + +//---------------------------------------------------------------------------// +// AMD Kernel Code, and its dependencies // +//---------------------------------------------------------------------------// + +typedef uint8_t hsa_powertwo8_t; +typedef uint32_t hsa_ext_code_kind_t; +typedef uint8_t hsa_ext_brig_profile8_t; +typedef uint8_t hsa_ext_brig_machine_model8_t; +typedef uint64_t hsa_ext_control_directive_present64_t; +typedef uint16_t hsa_ext_exception_kind16_t; +typedef uint32_t hsa_ext_code_kind32_t; + +typedef struct hsa_dim3_s { + uint32_t x; + uint32_t y; + uint32_t z; +} hsa_dim3_t; + +/// The version of the amd_*_code_t struct. Minor versions must be +/// backward compatible. 
+typedef uint32_t amd_code_version32_t; +enum amd_code_version_t { + AMD_CODE_VERSION_MAJOR = 0, + AMD_CODE_VERSION_MINOR = 1 +}; + +/// The values used to define the number of bytes to use for the +/// swizzle element size. +enum amd_element_byte_size_t { + AMD_ELEMENT_2_BYTES = 0, + AMD_ELEMENT_4_BYTES = 1, + AMD_ELEMENT_8_BYTES = 2, + AMD_ELEMENT_16_BYTES = 3 +}; + +/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and +/// COMPUTE_PGM_RSRC2 registers. +typedef uint64_t amd_compute_pgm_resource_register64_t; + +/// Every amd_*_code_t has the following properties, which are composed of +/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), +/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount +/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. +/// +/// (Note that bit fields cannot be used as their layout is +/// implementation defined in the C standard and so cannot be used to +/// specify an ABI) +typedef uint32_t amd_code_property32_t; +enum amd_code_property_mask_t { + + /// Enable the setup of the SGPR user data registers + /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t + /// for initial register state. + /// + /// The total number of SGPRuser data registers requested must not + /// exceed 16. Any requests beyond 16 will be ignored. + /// + /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of + /// SGPR user data registers enabled up to 16). + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, + 
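/// Editorial sketch, not part of this patch: the *_SHIFT, *_WIDTH and mask
/// values in this enum are intended to be used as in the helpers below. The
/// helper names are illustrative; only the enum and typedef names come from
/// this header.
static inline uint32_t amd_code_property_get(amd_code_property32_t props,
                                             uint32_t mask, uint32_t shift) {
  // Extract a bit field, e.g.
  //   amd_code_property_get(props,
  //                         AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR,
  //                         AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT)
  // yields 0 or 1 for the single-bit "enable" fields.
  return (props & mask) >> shift;
}

static inline unsigned amd_element_byte_size_in_bytes(unsigned encoded) {
  // amd_element_byte_size_t encodes 2, 4, 8 and 16 bytes as the values 0..3.
  return 2u << encoded;
}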
+ AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, + + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + + /// Control wave ID base counter for GDS ordered-append. Used to set + /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if + /// ORDERED_APPEND_MODE also needs to be settable) + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, + + /// The interleave (swizzle) element size in bytes required by the + /// code for private memory. This must be 2, 4, 8 or 16. This value + /// is provided to the finalizer when it is invoked and is recorded + /// here. The hardware will interleave the memory requests of each + /// lane of a wavefront by this element size to ensure each + /// work-item gets a distinct memory memory location. Therefore, the + /// finalizer ensures that all load and store operations done to + /// private memory do not exceed this size. For example, if the + /// element size is 4 (32-bits or dword) and a 64-bit value must be + /// loaded, the finalizer will generate two 32-bit loads. This + /// ensures that the interleaving will get the the work-item + /// specific dword for both halves of the 64-bit value. If it just + /// did a 64-bit load then it would get one dword which belonged to + /// its own work-item, but the second dword would belong to the + /// adjacent lane work-item since the interleaving is in dwords. + /// + /// The value used must match the value that the runtime configures + /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This + /// is generally DWORD. + /// + /// Use values from the amd_element_byte_size_t enum. + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, + + /// Are global memory addresses 64 bits. Must match + /// amd_kernel_code_t.hsail_machine_model == + /// HSA_MACHINE_LARGE. Must also match + /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), + /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, + AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, + + /// Indicate if the generated ISA is using a dynamically sized call + /// stack. 
This can happen if calls are implemented using a call + /// stack and recursion, alloca or calls to indirect functions are + /// present. In these cases the Finalizer cannot compute the total + /// private segment size at compile time. In this case the + /// workitem_private_segment_byte_size only specifies the statically + /// know private segment size, and additional space must be added + /// for the call stack. + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, + + /// Indicate if code generated has support for debugging. + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT +}; + +/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// control directives. These control how the finalizer generates code. This +/// struct is used both as an argument to hsaFinalizeKernel to specify values for +/// the control directives, and is used in HsaKernelCode to record the values of +/// the control directives that the finalize used when generating the code which +/// either came from the finalizer argument or explicit HSAIL control +/// directives. See the definition of the control directives in HSA Programmer's +/// Reference Manual which also defines how the values specified as finalizer +/// arguments have to agree with the control directives in the HSAIL code. +typedef struct hsa_ext_control_directives_s { + /// This is a bit set indicating which control directives have been + /// specified. If the value is 0 then there are no control directives specified + /// and the rest of the fields can be ignored. The bits are accessed using the + /// hsa_ext_control_directives_present_mask_t. Any control directive that is not + /// enabled in this bit set must have the value of all 0s. + hsa_ext_control_directive_present64_t enabled_control_directives; + + /// If enableBreakExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. If the kernel being finalized + /// has any enablebreakexceptions control directives, then the values specified + /// by this argument are unioned with the values in these control + /// directives. If any of the functions the kernel calls have an + /// enablebreakexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_break_exceptions; + + /// If enableDetectExceptions is not enabled then must be 0, otherwise must be + /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT + /// policy enabled. If this set is not empty then the generated code may have + /// lower performance than if the set is empty. However, an implementation + /// should endeavour to make the performance impact small. If the kernel being + /// finalized has any enabledetectexceptions control directives, then the + /// values specified by this argument are unioned with the values in these + /// control directives. 
If any of the functions the kernel calls have an + /// enabledetectexceptions control directive, then they must be equal or a + /// subset of, this union. + hsa_ext_exception_kind16_t enable_detect_exceptions; + + /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of + /// dynamic group segment can be allocated for a dispatch, otherwise the value + /// specifies the maximum number of bytes of dynamic group segment that can be + /// allocated for a dispatch. If the kernel being finalized has any + /// maxdynamicsize control directives, then the values must be the same, and + /// must be the same as this argument if it is enabled. This value can be used + /// by the finalizer to determine the maximum number of bytes of group memory + /// used by each work-group by adding this value to the group memory required + /// for all group segment variables used by the kernel and all functions it + /// calls, and group memory used to implement other HSAIL features such as + /// fbarriers and the detect exception operations. This can allow the finalizer + /// to determine the expected number of work-groups that can be executed by a + /// compute unit and allow more resources to be allocated to the work-items if + /// it is known that fewer work-groups can be executed due to group memory + /// limitations. + uint32_t max_dynamic_group_size; + + /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater + /// than 0. See HSA Programmer's Reference Manual description of + /// maxflatgridsize control directive. + uint32_t max_flat_grid_size; + + /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be + /// greater than 0. See HSA Programmer's Reference Manual description of + /// maxflatworkgroupsize control directive. + uint32_t max_flat_workgroup_size; + + /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the + /// finalizer is free to generate ISA that may result in any number of + /// work-groups executing on a single compute unit. Otherwise, the finalizer + /// should attempt to generate ISA that will allow the specified number of + /// work-groups to execute on a single compute unit. This is only a hint and + /// can be ignored by the finalizer. If the kernel being finalized, or any of + /// the functions it calls, has a requested control directive, then the values + /// must be the same. This can be used to determine the number of resources + /// that should be allocated to a single work-group and work-item. For example, + /// a low value may allow more resources to be allocated, resulting in higher + /// per work-item performance, as it is known there will never be more than the + /// specified number of work-groups actually executing on the compute + /// unit. Conversely, a high value may allocate fewer resources, resulting in + /// lower per work-item performance, which is offset by the fact it allows more + /// work-groups to actually execute on the compute unit. + uint32_t requested_workgroups_per_cu; + + /// If not enabled then all elements for Dim3 must be 0, otherwise every + /// element must be greater than 0. See HSA Programmer's Reference Manual + /// description of requiredgridsize control directive. + hsa_dim3_t required_grid_size; + + /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be + /// 0, and the produced code can be dispatched with any legal work-group range + /// consistent with the dispatch dimensions. 
Otherwise, the code produced must + /// always be dispatched with the specified work-group range. No element of the + /// specified range must be 0. It must be consistent with required_dimensions + /// and max_flat_workgroup_size. If the kernel being finalized, or any of the + /// functions it calls, has a requiredworkgroupsize control directive, then the + /// values must be the same. Specifying a value can allow the finalizer to + /// optimize work-group id operations, and if the number of work-items in the + /// work-group is less than the WAVESIZE then barrier operations can be + /// optimized to just a memory fence. + hsa_dim3_t required_workgroup_size; + + /// If requiredDim is not enabled then must be 0 and the produced kernel code + /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is + /// 1..3 and the code produced must only be dispatched with a dimension that + /// matches. Other values are illegal. If the kernel being finalized, or any of + /// the functions it calls, has a requireddimsize control directive, then the + /// values must be the same. This can be used to optimize the code generated to + /// compute the absolute and flat work-group and work-item id, and the dim + /// HSAIL operations. + uint8_t required_dim; + + /// Reserved. Must be 0. + uint8_t reserved[75]; +} hsa_ext_control_directives_t; + +/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel +/// Code Object to set up the hardware to execute the kernel dispatch. +/// +/// Initial Kernel Register State. +/// +/// Initial kernel register state will be set up by CP/SPI prior to the start +/// of execution of every wavefront. This is limited by the constraints of the +/// current hardware. +/// +/// The order of the SGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enable_sgpr_* bit fields. The register numbers used for enabled registers +/// are dense starting at SGPR0: the first enabled register is SGPR0, the next +/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR +/// number. +/// +/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and +/// apply to all waves of the grid. It is possible to specify more than 16 User +/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 +/// are actually initialized. These are then immediately followed by the System +/// SGPRs that are set up by ADC/SPI and can have different values for each wave +/// of the grid dispatch. +/// +/// SGPR register initial state is defined as follows: +/// +/// Private Segment Buffer (enable_sgpr_private_segment_buffer): +/// Number of User SGPR registers: 4. V# that can be used, together with +/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg +/// segments using a segment address. It must be set as follows: +/// - Base address: of the scratch memory area used by the dispatch. It +/// does not include the scratch wave offset. It will be the per process +/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for +/// example there may be a per pipe offset, or per AQL Queue offset). +/// - Stride + data_format: Element Size * Index Stride (???) +/// - Cache swizzle: ??? +/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for +/// scratch) +/// - Num records: Flat Scratch Work Item Size / Element Size (???) +/// - Dst_sel_*: ??? +/// - Num_format: ??? 
+/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must +/// agree with amd_kernel_code_t.privateElementSize) +/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must +/// be number of wavefront lanes for scratch, must agree with +/// amd_kernel_code_t.wavefrontSize) +/// - Add tid enable: 1 +/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, +/// - Hash_enable: ??? +/// - Heap: ??? +/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE +/// - Type: 0 (a buffer) (???) +/// +/// Dispatch Ptr (enable_sgpr_dispatch_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet +/// for kernel actually executing. +/// +/// Queue Ptr (enable_sgpr_queue_ptr): +/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for +/// AQL queue on which the dispatch packet was queued. +/// +/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): +/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This +/// is directly copied from the kernargPtr in the dispatch packet. Having CP +/// load it once avoids loading it at the beginning of every wavefront. +/// +/// Dispatch Id (enable_sgpr_dispatch_id): +/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch +/// packet being executed. +/// +/// Flat Scratch Init (enable_sgpr_flat_scratch_init): +/// Number of User SGPR registers: 2. This is 2 SGPRs. +/// +/// For CI/VI: +/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE +/// to base of memory for scratch for this dispatch. This is the same offset +/// used in computing the Scratch Segment Buffer base address. The value of +/// Scratch Wave Offset must be added by the kernel code and moved to +/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. +/// +/// The second SGPR is 32 bit byte size of a single work-itemÂ’s scratch +/// memory usage. This is directly loaded from the dispatch packet Private +/// Segment Byte Size and rounded up to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in +/// flat memory instructions. Having CP load it once avoids loading it at +/// the beginning of every wavefront. +/// +/// For PI: +/// This is the 64 bit base address of the scratch backing memory for +/// allocated by CP for this dispatch. +/// +/// Private Segment Size (enable_sgpr_private_segment_size): +/// Number of User SGPR registers: 1. The 32 bit byte size of a single +/// work-itemÂ’s scratch memory allocation. This is the value from the dispatch +/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. +/// +/// \todo [Does CP need to round this to >4 byte alignment?] +/// +/// Having CP load it once avoids loading it at the beginning of every +/// wavefront. +/// +/// \todo [This will not be used for CI/VI since it is the same value as +/// the second SGPR of Flat Scratch Init. However, it is need for PI which +/// changes meaning of Flat Scratchg Init..] +/// +/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the X dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). +/// +/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): +/// Number of User SGPR registers: 1. 
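/// Illustration only (not part of this patch): the grid work-group counts
/// described above and below are plain ceiling divisions over the dispatch
/// packet fields, i.e. ((gridSize.x + workgroupSize.x - 1) / workgroupSize.x).
static inline uint32_t amd_grid_workgroup_count(uint32_t grid_size,
                                                uint32_t workgroup_size) {
  return (grid_size + workgroup_size - 1) / workgroup_size;
}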
32 bit count of the number of +/// work-groups in the Y dimension for the grid being executed. Computed from +/// the fields in the HsaDispatchPacket as +/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): +/// Number of User SGPR registers: 1. 32 bit count of the number of +/// work-groups in the Z dimension for the grid being executed. Computed +/// from the fields in the HsaDispatchPacket as +/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). +/// +/// Only initialized if <16 previous SGPRs initialized. +/// +/// Work-Group Id X (enable_sgpr_workgroup_id_x): +/// Number of System SGPR registers: 1. 32 bit work group id in X dimension +/// of grid for wavefront. Always present. +/// +/// Work-Group Id Y (enable_sgpr_workgroup_id_y): +/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension +/// of grid for wavefront. +/// +/// Work-Group Id Z (enable_sgpr_workgroup_id_z): +/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension +/// of grid for wavefront. If present then Work-group Id Y will also be +/// present +/// +/// Work-Group Info (enable_sgpr_workgroup_info): +/// Number of System SGPR registers: 1. {first_wave, 14Â’b0000, +/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} +/// +/// Private Segment Wave Byte Offset +/// (enable_sgpr_private_segment_wave_byte_offset): +/// Number of System SGPR registers: 1. 32 bit byte offset from base of +/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg +/// segment address when using Scratch Segment Buffer. It must be added to +/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. +/// +/// +/// The order of the VGPR registers is defined, but the Finalizer can specify +/// which ones are actually setup in the amd_kernel_code_t object using the +/// enableVgpr* bit fields. The register numbers used for enabled registers +/// are dense starting at VGPR0: the first enabled register is VGPR0, the next +/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR +/// number. +/// +/// VGPR register initial state is defined as follows: +/// +/// Work-Item Id X (always initialized): +/// Number of registers: 1. 32 bit work item id in X dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id X (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Y dimension of work-group +/// for wavefront lane. +/// +/// Work-Item Id X (enable_vgpr_workitem_id > 0): +/// Number of registers: 1. 32 bit work item id in Z dimension of work-group +/// for wavefront lane. +/// +/// +/// The setting of registers is being done by existing GPU hardware as follows: +/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data +/// registers. +/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any +/// combination including none. +/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot +/// be added into the value Flat Scratch Offset which would avoid the +/// Finalizer generated prolog having to do the add. +/// 4) The VGPRs are set by SPI which only supports specifying either (X), +/// (X, Y) or (X, Y, Z). +/// +/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so +/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and +/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? 
Register. +/// +/// The global segment can be accessed either using flat operations or buffer +/// operations. If buffer operations are used then the Global Buffer used to +/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a +/// segment address is not passed into the kernel code by CP since its base +/// address is always 0. Instead the Finalizer generates prolog code to +/// initialize 4 SGPRs with a V# that has the following properties, and then +/// uses that in the buffer instructions: +/// - base address of 0 +/// - no swizzle +/// - ATC=1 +/// - MTYPE set to support memory coherence specified in +/// amd_kernel_code_t.globalMemoryCoherence +/// +/// When the Global Buffer is used to access the Kernarg segment, must add the +/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. +/// Alternatively scalar loads can be used if the kernarg offset is uniform, as +/// the kernarg segment is constant for the duration of the kernel execution. +/// +typedef struct amd_kernel_code_s { + /// The AMD major version of the Code Object. Must be the value + /// AMD_CODE_VERSION_MAJOR. + amd_code_version32_t amd_code_version_major; + + /// The AMD minor version of the Code Object. Minor versions must be + /// backward compatible. Must be the value + /// AMD_CODE_VERSION_MINOR. + amd_code_version32_t amd_code_version_minor; + + /// The byte size of this struct. Must be set to + /// sizeof(amd_kernel_code_t). Used for backward + /// compatibility. + uint32_t struct_byte_size; + + /// The target chip instruction set for which code has been + /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration + /// in sc/Interface/SCCommon.h. + uint32_t target_chip; + + /// Byte offset (possibly negative) from start of amd_kernel_code_t + /// object to kernel's entry point instruction. The actual code for + /// the kernel is required to be 256 byte aligned to match hardware + /// requirements (SQ cache line is 16). The code must be position + /// independent code (PIC) for AMD devices to give runtime the + /// option of copying code to discrete GPU memory or APU L2 + /// cache. The Finalizer should endeavour to allocate all kernel + /// machine code in contiguous memory pages so that a device + /// pre-fetcher will tend to only pre-fetch Kernel Code objects, + /// improving cache performance. + int64_t kernel_code_entry_byte_offset; + + /// Range of bytes to consider prefetching expressed as an offset + /// and size. The offset is from the start (possibly negative) of + /// amd_kernel_code_t object. Set both to 0 if no prefetch + /// information is available. + /// + /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did + /// not make the size a uint64_t as prefetching more than 4GiB seems + /// excessive. + int64_t kernel_code_prefetch_byte_offset; + uint64_t kernel_code_prefetch_byte_size; + + /// Number of bytes of scratch backing memory required for full + /// occupancy of target chip. This takes into account the number of + /// bytes of scratch per work-item, the wavefront size, the maximum + /// number of wavefronts per CU, and the number of CUs. This is an + /// upper limit on scratch. If the grid being dispatched is small it + /// may only need less than this. If the kernel uses no scratch, or + /// the Finalizer has not computed this value, it must be 0. + uint64_t max_scratch_backing_memory_byte_size; + + /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and + /// COMPUTE_PGM_RSRC2 registers. 
+  amd_compute_pgm_resource_register64_t compute_pgm_resource_registers;
+
+  /// Code properties. See amd_code_property_mask_t for a full list of
+  /// properties.
+  amd_code_property32_t code_properties;
+
+  /// The amount of memory required for the combined private, spill
+  /// and arg segments for a work-item in bytes. If
+  /// is_dynamic_callstack is 1 then additional space must be added to
+  /// this value for the call stack.
+  uint32_t workitem_private_segment_byte_size;
+
+  /// The amount of group segment memory required by a work-group in
+  /// bytes. This does not include any dynamically allocated group
+  /// segment memory that may be added when the kernel is
+  /// dispatched.
+  uint32_t workgroup_group_segment_byte_size;
+
+  /// Number of bytes of GDS required by the kernel dispatch. Must be 0 if
+  /// not using GDS.
+  uint32_t gds_segment_byte_size;
+
+  /// The size in bytes of the kernarg segment that holds the values
+  /// of the arguments to the kernel. This could be used by CP to
+  /// prefetch the kernarg segment pointed to by the dispatch packet.
+  uint64_t kernarg_segment_byte_size;
+
+  /// Number of fbarriers used in the kernel and all functions it
+  /// calls. If the implementation uses group memory to allocate the
+  /// fbarriers then that amount must already be included in the
+  /// workgroup_group_segment_byte_size total.
+  uint32_t workgroup_fbarrier_count;
+
+  /// Number of scalar registers used by a wavefront. This includes
+  /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+  /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
+  /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+  uint16_t wavefront_sgpr_count;
+
+  /// Number of vector registers used by each work-item. Used to set
+  /// COMPUTE_PGM_RSRC1.VGPRS.
+  uint16_t workitem_vgpr_count;
+
+  /// If reserved_vgpr_count is 0 then this must be 0. Otherwise, this is the
+  /// first fixed VGPR number reserved.
+  uint16_t reserved_vgpr_first;
+
+  /// The number of consecutive VGPRs reserved by the client. If
+  /// is_debug_supported then this count includes VGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_vgpr_count;
+
+  /// If reserved_sgpr_count is 0 then this must be 0. Otherwise, this is the
+  /// first fixed SGPR number reserved.
+  uint16_t reserved_sgpr_first;
+
+  /// The number of consecutive SGPRs reserved by the client. If
+  /// is_debug_supported then this count includes SGPRs reserved
+  /// for debugger use.
+  uint16_t reserved_sgpr_count;
+
+  /// If is_debug_supported is 0 then this must be 0. Otherwise, this is the
+  /// fixed SGPR number used to hold the wave scratch offset for the
+  /// entire kernel execution, or uint16_t(-1) if the register is not
+  /// used or not known.
+  uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+  /// If is_debug_supported is 0 then this must be 0. Otherwise, this is the
+  /// fixed SGPR number of the first of 4 SGPRs used to hold the
+  /// scratch V# used for the entire kernel execution, or uint16_t(-1)
+  /// if the registers are not used or not known.
+  uint16_t debug_private_segment_buffer_sgpr;
+
+  /// The maximum byte alignment of variables used by the kernel in
+  /// the specified memory segment. Expressed as a power of two. Must
+  /// be at least HSA_POWERTWO_16.
+  hsa_powertwo8_t kernarg_segment_alignment;
+  hsa_powertwo8_t group_segment_alignment;
+  hsa_powertwo8_t private_segment_alignment;
+
+  uint8_t reserved3;
+
+  /// Type of code object.
+  hsa_ext_code_kind32_t code_type;
+
+  /// Reserved for code properties if any are defined in the future.
+  /// There are currently no code properties so this field must be 0.
+  uint32_t reserved4;
+
+  /// Wavefront size expressed as a power of two. Must be a power of 2
+  /// in range 1..64 inclusive. Used to support a runtime query that
+  /// obtains the wavefront size, which may be used by the application to
+  /// allocate dynamic group memory and set the dispatch work-group
+  /// size.
+  hsa_powertwo8_t wavefront_size;
+
+  /// The optimization level specified when the kernel was
+  /// finalized.
+  uint8_t optimization_level;
+
+  /// The HSAIL profile defines which features are used. This
+  /// information is from the HSAIL version directive. If this
+  /// amd_kernel_code_t is not generated from an HSAIL compilation
+  /// unit then this must be 0.
+  hsa_ext_brig_profile8_t hsail_profile;
+
+  /// The HSAIL machine model gives the address sizes used by the
+  /// code. This information is from the HSAIL version directive. If
+  /// not generated from an HSAIL compilation unit then it must still
+  /// indicate for what machine model the code is generated.
+  hsa_ext_brig_machine_model8_t hsail_machine_model;
+
+  /// The HSAIL major version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then this must be 0.
+  uint32_t hsail_version_major;
+
+  /// The HSAIL minor version. This information is from the HSAIL
+  /// version directive. If this amd_kernel_code_t is not
+  /// generated from an HSAIL compilation unit then this must be 0.
+  uint32_t hsail_version_minor;
+
+  /// Reserved for HSAIL target options if any are defined in the
+  /// future. There are currently no target options so this field
+  /// must be 0.
+  uint16_t reserved5;
+
+  /// Reserved. Must be 0.
+  uint16_t reserved6;
+
+  /// The values should be the actual values used by the finalizer
+  /// in generating the code. This may be the union of values
+  /// specified as finalizer arguments and explicit HSAIL control
+  /// directives. If the finalizer chooses to ignore a control
+  /// directive, and not generate constrained code, then the control
+  /// directive should not be marked as enabled even though it was
+  /// present in the HSAIL or finalizer argument. The values are
+  /// intended to reflect the constraints that the code actually
+  /// requires to correctly execute, not the values that were
+  /// actually specified at finalize time.
+  hsa_ext_control_directives_t control_directive;
+
+  /// The code can immediately follow the amd_kernel_code_t, or can
+  /// come after subsequent amd_kernel_code_t structs when there are
+  /// multiple kernels in the compilation unit.
+
+} amd_kernel_code_t;
+
+#endif // AMDKERNELCODET_H
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 00000000000..0c9a68804a3
--- /dev/null
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -0,0 +1,1380 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace { + +struct OptionalOperand; + +class AMDGPUOperand : public MCParsedAsmOperand { + enum KindTy { + Token, + Immediate, + Register, + Expression + } Kind; + + SMLoc StartLoc, EndLoc; + +public: + AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} + + MCContext *Ctx; + + enum ImmTy { + ImmTyNone, + ImmTyDSOffset0, + ImmTyDSOffset1, + ImmTyGDS, + ImmTyOffset, + ImmTyGLC, + ImmTySLC, + ImmTyTFE, + ImmTyClamp, + ImmTyOMod + }; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct ImmOp { + bool IsFPImm; + ImmTy Type; + int64_t Val; + }; + + struct RegOp { + unsigned RegNo; + int Modifiers; + const MCRegisterInfo *TRI; + bool IsForcedVOP3; + }; + + union { + TokOp Tok; + ImmOp Imm; + RegOp Reg; + const MCExpr *Expr; + }; + + void addImmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm(getImm())); + } + + StringRef getToken() const { + return StringRef(Tok.Data, Tok.Length); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isReg()) + addRegOperands(Inst, N); + else + addImmOperands(Inst, N); + } + + void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createImm( + Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); + addRegOperands(Inst, N); + } + + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } + } + + bool defaultTokenHasSuffix() const { + StringRef Token(Tok.Data, Tok.Length); + + return Token.endswith("_e32") || Token.endswith("_e64"); + } + + bool isToken() const override { + return Kind == Token; + } + + bool isImm() const override { + return Kind == Immediate; + } + + bool isInlineImm() const { + float F = BitsToFloat(Imm.Val); + // TODO: Add 0.5pi for VI + return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + } + + bool isDSOffset0() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset0; + } + + bool isDSOffset1() const { + assert(isImm()); + return Imm.Type == ImmTyDSOffset1; + } + + int64_t getImm() const { + return Imm.Val; + } + + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; + } + + bool isRegKind() const { + return Kind == Register; + } + + bool isReg() const override { + return Kind == Register && Reg.Modifiers == -1; + } + + bool isRegWithInputMods() const { + return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + } + + void setModifiers(unsigned Mods) { + assert(isReg()); + Reg.Modifiers = Mods; + } + + bool hasModifiers() const { + assert(isRegKind()); + return Reg.Modifiers != -1; + } + + unsigned getReg() const override { + return Reg.RegNo; + } + + bool isRegOrImm() const { + return isReg() || isImm(); + } + + bool isRegClass(unsigned RCID) const { + return Reg.TRI->getRegClass(RCID).contains(getReg()); + } + + bool isSCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + } + + bool isSSrc64() const { + return isImm() || isInlineImm() || + (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + } + + bool isVCSrc32() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVCSrc64() const { + return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isVSrc32() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + } + + bool isVSrc64() const { + return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + } + + bool isMem() const override { + return false; + } + + bool isExpr() const { + return Kind == Expression; + } + + bool isSoppBrTarget() const { + return isExpr() || isImm(); + } + + SMLoc getStartLoc() const override { + return StartLoc; + } + + SMLoc getEndLoc() const override { + return EndLoc; + } + + void print(raw_ostream &OS) const override { } + + static std::unique_ptr CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { + auto Op = llvm::make_unique(Immediate); + Op->Imm.Val = Val; + Op->Imm.IsFPImm = IsFPImm; + Op->Imm.Type = Type; + Op->StartLoc = Loc; + Op->EndLoc = Loc; + return Op; + } + + static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { + auto Res = llvm::make_unique(Token); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + Res->StartLoc = Loc; + Res->EndLoc = Loc; + return Res; + } + + static std::unique_ptr CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + 
bool ForceVOP3) { + auto Op = llvm::make_unique(Register); + Op->Reg.RegNo = RegNo; + Op->Reg.TRI = TRI; + Op->Reg.Modifiers = -1; + Op->Reg.IsForcedVOP3 = ForceVOP3; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static std::unique_ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { + auto Op = llvm::make_unique(Expression); + Op->Expr = Expr; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + bool isDSOffset() const; + bool isDSOffset01() const; + bool isSWaitCnt() const; + bool isMubufOffset() const; +}; + +class AMDGPUAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + const MCInstrInfo &MII; + MCAsmParser &Parser; + + unsigned ForcedEncodingSize; + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "AMDGPUGenAsmMatcher.inc" + + /// } + +public: + AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0){ + + if (STI.getFeatureBits().none()) { + // Set default features. + STI.ToggleFeature("SOUTHERN_ISLANDS"); + } + + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + unsigned getForcedEncodingSize() const { + return ForcedEncodingSize; + } + + void setForcedEncodingSize(unsigned Size) { + ForcedEncodingSize = Size; + } + + bool isForcedVOP3() const { + return ForcedEncodingSize == 64; + } + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + unsigned checkTargetMatchPredicate(MCInst &Inst) override; + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) override; + bool ParseDirective(AsmToken DirectiveID) override; + OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default = 0); + OperandMatchResultTy parseIntWithPrefix(const char *Prefix, + OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy = + AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseOptionalOps( + const ArrayRef &OptionalOps, + OperandVector &Operands); + + + void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); + void cvtDS(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); + OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); + + bool parseCnt(int64_t &IntVal); + OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + + OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); + void cvtFlat(MCInst &Inst, const OperandVector &Operands); + + void cvtMubuf(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseOffset(OperandVector &Operands); + OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); + OperandMatchResultTy parseGLC(OperandVector &Operands); + OperandMatchResultTy parseSLC(OperandVector 
&Operands); + OperandMatchResultTy parseTFE(OperandVector &Operands); + + OperandMatchResultTy parseDMask(OperandVector &Operands); + OperandMatchResultTy parseUNorm(OperandVector &Operands); + OperandMatchResultTy parseR128(OperandVector &Operands); + + void cvtVOP3(MCInst &Inst, const OperandVector &Operands); + OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); +}; + +struct OptionalOperand { + const char *Name; + AMDGPUOperand::ImmTy Type; + bool IsBit; + int64_t Default; + bool (*ConvertResult)(int64_t&); +}; + +} + +static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { + if (IsVgpr) { + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::VGPR_32RegClassID; + case 2: return AMDGPU::VReg_64RegClassID; + case 3: return AMDGPU::VReg_96RegClassID; + case 4: return AMDGPU::VReg_128RegClassID; + case 8: return AMDGPU::VReg_256RegClassID; + case 16: return AMDGPU::VReg_512RegClassID; + } + } + + switch (RegWidth) { + default: llvm_unreachable("Unknown register width"); + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SReg_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } +} + +static unsigned getRegForName(const StringRef &RegName) { + + return StringSwitch(RegName) + .Case("exec", AMDGPU::EXEC) + .Case("vcc", AMDGPU::VCC) + .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("m0", AMDGPU::M0) + .Case("scc", AMDGPU::SCC) + .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("vcc_lo", AMDGPU::VCC_LO) + .Case("vcc_hi", AMDGPU::VCC_HI) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Default(0); +} + +bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + const AsmToken Tok = Parser.getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + const StringRef &RegName = Tok.getString(); + RegNo = getRegForName(RegName); + + if (RegNo) { + Parser.Lex(); + return false; + } + + // Match vgprs and sgprs + if (RegName[0] != 's' && RegName[0] != 'v') + return true; + + bool IsVgpr = RegName[0] == 'v'; + unsigned RegWidth; + unsigned RegIndexInClass; + if (RegName.size() > 1) { + // We have a 32-bit register + RegWidth = 1; + if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) + return true; + Parser.Lex(); + } else { + // We have a register greater than 32-bits. + + int64_t RegLo, RegHi; + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegLo)) + return true; + + if (getLexer().isNot(AsmToken::Colon)) + return true; + + Parser.Lex(); + if (getParser().parseAbsoluteExpression(RegHi)) + return true; + + if (getLexer().isNot(AsmToken::RBrac)) + return true; + + Parser.Lex(); + RegWidth = (RegHi - RegLo) + 1; + if (IsVgpr) { + // VGPR registers aren't aligned. + RegIndexInClass = RegLo; + } else { + // SGPR registers are aligned. Max alignment is 4 dwords. 
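+      // Worked example (illustrative, not from the original source): parsing
+      // "s[4:7]" gives RegLo = 4, RegHi = 7, so RegWidth = 4 and
+      // RegIndexInClass = 4 / std::min(4u, 4u) = 1, i.e. the second aligned
+      // 4-dword tuple in the SReg_128 class selected below.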
+ RegIndexInClass = RegLo / std::min(RegWidth, 4u); + } + } + + const MCRegisterInfo *TRC = getContext().getRegisterInfo(); + unsigned RC = getRegClass(IsVgpr, RegWidth); + if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + return true; + RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); + return false; +} + +unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { + + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + + if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || + (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) + return Match_InvalidOperand; + + return Match_Success; +} + + +bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + + switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { + default: break; + case Match_Success: + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction not supported on this GPU"); + + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { + if (ErrorInfo >= Operands.size()) { + if (isForcedVOP3()) { + // If 64-bit encoding has been forced we can end up with no + // clamp or omod operands if none of the registers have modifiers, + // so we need to add these to the operand list. + AMDGPUOperand &LastOp = + ((AMDGPUOperand &)*Operands[Operands.size() - 1]); + if (LastOp.isRegKind() || + (LastOp.isImm() && + LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { + SMLoc S = Parser.getTok().getLoc(); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyClamp)); + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOMod)); + bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, + Out, ErrorInfo, + MatchingInlineAsm); + if (!Res) + return Res; + } + + } + return Error(IDLoc, "too few operands for instruction"); + } + + ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + } + llvm_unreachable("Implement any new match types added!"); +} + +bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { + return true; +} + +static bool operandsHaveModifiers(const OperandVector &Operands) { + + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); + if (Op.isRegKind() && Op.hasModifiers()) + return true; + if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || + Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) + return true; + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { + + // Try to parse with a custom parser + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + + // If we successfully parsed the operand or if there as an error parsing, + // we are done. + // + // If we are parsing after we reach EndOfStatement then this means we + // are appending default values to the Operands list. This is only done + // by custom parser, so we shouldn't continue on to the generic parsing. 
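+  // (For example, once "ds_read_b32 v0, v1" reaches EndOfStatement, the
+  // custom DS parser can still append default "offset" and "gds" immediates
+  // so the matcher sees a complete operand list.)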
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || + getLexer().is(AsmToken::EndOfStatement)) + return ResTy; + + bool Negate = false, Abs = false; + if (getLexer().getKind()== AsmToken::Minus) { + Parser.Lex(); + Negate = true; + } + + if (getLexer().getKind() == AsmToken::Pipe) { + Parser.Lex(); + Abs = true; + } + + switch(getLexer().getKind()) { + case AsmToken::Integer: { + SMLoc S = Parser.getTok().getLoc(); + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + APInt IntVal32(32, IntVal); + if (IntVal32.getSExtValue() != IntVal) { + Error(S, "invalid immediate: only 32-bit values are legal"); + return MatchOperand_ParseFail; + } + + IntVal = IntVal32.getSExtValue(); + if (Negate) + IntVal *= -1; + Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); + return MatchOperand_Success; + } + case AsmToken::Real: { + // FIXME: We should emit an error if a double precisions floating-point + // value is used. I'm not sure the best way to detect this. + SMLoc S = Parser.getTok().getLoc(); + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + + APFloat F((float)BitsToDouble(IntVal)); + if (Negate) + F.changeSign(); + Operands.push_back( + AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); + return MatchOperand_Success; + } + case AsmToken::Identifier: { + SMLoc S, E; + unsigned RegNo; + if (!ParseRegister(RegNo, S, E)) { + + bool HasModifiers = operandsHaveModifiers(Operands); + unsigned Modifiers = 0; + + if (Negate) + Modifiers |= 0x1; + + if (Abs) { + if (getLexer().getKind() != AsmToken::Pipe) + return MatchOperand_ParseFail; + Parser.Lex(); + Modifiers |= 0x2; + } + + if (Modifiers && !HasModifiers) { + // We are adding a modifier to src1 or src2 and previous sources + // don't have modifiers, so we need to go back and empty modifers + // for each previous source. + for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; + --PrevRegIdx) { + + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); + RegOp.setModifiers(0); + } + } + + + Operands.push_back(AMDGPUOperand::CreateReg( + RegNo, S, E, getContext().getRegisterInfo(), + isForcedVOP3())); + + if (HasModifiers || Modifiers) { + AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); + RegOp.setModifiers(Modifiers); + + } + } else { + Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), + S)); + Parser.Lex(); + } + return MatchOperand_Success; + } + default: + return MatchOperand_NoMatch; + } +} + +bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + + // Clear any forced encodings from the previous instruction. + setForcedEncodingSize(0); + + if (Name.endswith("_e64")) + setForcedEncodingSize(64); + else if (Name.endswith("_e32")) + setForcedEncodingSize(32); + + // Add the instruction mnemonic + Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); + + while (!getLexer().is(AsmToken::EndOfStatement)) { + AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); + + // Eat the comma or space if there is one. 
+ if (getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + switch (Res) { + case MatchOperand_Success: break; + case MatchOperand_ParseFail: return Error(getLexer().getLoc(), + "failed parsing operand."); + case MatchOperand_NoMatch: return Error(getLexer().getLoc(), + "not a valid operand."); + } + } + + // Once we reach end of statement, continue parsing so we can add default + // values for optional arguments. + AMDGPUAsmParser::OperandMatchResultTy Res; + while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Error(getLexer().getLoc(), "failed parsing operand."); + } + return false; +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, + int64_t Default) { + + // We are at the end of the statement, and this is a default argument, so + // use a default value. + if (getLexer().is(AsmToken::EndOfStatement)) { + Int = Default; + return MatchOperand_Success; + } + + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Identifier: { + StringRef OffsetName = Parser.getTok().getString(); + if (!OffsetName.equals(Prefix)) + return MatchOperand_NoMatch; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + + if (getParser().parseAbsoluteExpression(Int)) + return MatchOperand_ParseFail; + break; + } + } + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + + SMLoc S = Parser.getTok().getLoc(); + int64_t Offset = 0; + + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + if (Res != MatchOperand_Success) + return Res; + + Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, + enum AMDGPUOperand::ImmTy ImmTy) { + int64_t Bit = 0; + SMLoc S = Parser.getTok().getLoc(); + + // We are at the end of the statement, and this is a default argument, so + // use a default value. 
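+  // Illustrative behaviour for Name == "glc": the token "glc" yields 1,
+  // "noglc" yields 0, and any other token is reported as no match.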
+ if (getLexer().isNot(AsmToken::EndOfStatement)) { + switch(getLexer().getKind()) { + case AsmToken::Identifier: { + StringRef Tok = Parser.getTok().getString(); + if (Tok == Name) { + Bit = 1; + Parser.Lex(); + } else if (Tok.startswith("no") && Tok.endswith(Name)) { + Bit = 0; + Parser.Lex(); + } else { + return MatchOperand_NoMatch; + } + break; + } + default: + return MatchOperand_NoMatch; + } + } + + Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); + return MatchOperand_Success; +} + +static bool operandsHasOptionalOp(const OperandVector &Operands, + const OptionalOperand &OOp) { + for (unsigned i = 0; i < Operands.size(); i++) { + const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); + if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || + (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) + return true; + + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOptionalOps(const ArrayRef &OptionalOps, + OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + for (const OptionalOperand &Op : OptionalOps) { + if (operandsHasOptionalOp(Operands, Op)) + continue; + AMDGPUAsmParser::OperandMatchResultTy Res; + int64_t Value; + if (Op.IsBit) { + Res = parseNamedBit(Op.Name, Operands, Op.Type); + if (Res == MatchOperand_NoMatch) + continue; + return Res; + } + + Res = parseIntWithPrefix(Op.Name, Value, Op.Default); + + if (Res == MatchOperand_NoMatch) + continue; + + if (Res != MatchOperand_Success) + return Res; + + if (Op.ConvertResult && !Op.ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; +} + +//===----------------------------------------------------------------------===// +// ds +//===----------------------------------------------------------------------===// + +static const OptionalOperand DSOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +static const OptionalOperand DSOptionalOpsOff01 [] = { + {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, + {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOps, Operands); +} +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { + return parseOptionalOps(DSOptionalOpsOff01, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + AMDGPUAsmParser::OperandMatchResultTy Res = + parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); + if (Res == MatchOperand_NoMatch) { + Operands.push_back(AMDGPUOperand::CreateImm(0, S, + AMDGPUOperand::ImmTyOffset)); + Res = MatchOperand_Success; + } + return Res; +} + +bool AMDGPUOperand::isDSOffset() const { + return isImm() && isUInt<16>(getImm()); +} + +bool AMDGPUOperand::isDSOffset01() const { + return isImm() && isUInt<8>(getImm()); +} + +void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, + const OperandVector &Operands) { + + std::map OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if 
(Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; + unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + + ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 + ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + +void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { + + std::map OptionalIdx; + bool GDSOnly = false; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + if (Op.isToken() && Op.getToken() == "gds") { + GDSOnly = true; + continue; + } + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + + if (!GDSOnly) { + unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + } + Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 +} + + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + +bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { + StringRef CntName = Parser.getTok().getString(); + int64_t CntVal; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::LParen)) + return true; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return true; + + if (getParser().parseAbsoluteExpression(CntVal)) + return true; + + if (getLexer().isNot(AsmToken::RParen)) + return true; + + Parser.Lex(); + if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) + Parser.Lex(); + + int CntShift; + int CntMask; + + if (CntName == "vmcnt") { + CntMask = 0xf; + CntShift = 0; + } else if (CntName == "expcnt") { + CntMask = 0x7; + CntShift = 4; + } else if (CntName == "lgkmcnt") { + CntMask = 0x7; + CntShift = 8; + } else { + return true; + } + + IntVal &= ~(CntMask << CntShift); + IntVal |= (CntVal << CntShift); + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { + // Disable all counters by default. + // vmcnt [3:0] + // expcnt [6:4] + // lgkmcnt [10:8] + int64_t CntVal = 0x77f; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: + // The operand can be an integer value. 
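+    // For example (illustrative): "s_waitcnt 0" parses to CntVal = 0 (wait
+    // for vmcnt, expcnt and lgkmcnt to all drain), while the named form
+    // "s_waitcnt vmcnt(1)" is handled below and yields 0x771.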
+ if (getParser().parseAbsoluteExpression(CntVal)) + return MatchOperand_ParseFail; + break; + + case AsmToken::Identifier: + do { + if (parseCnt(CntVal)) + return MatchOperand_ParseFail; + } while(getLexer().isNot(AsmToken::EndOfStatement)); + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isSWaitCnt() const { + return isImm(); +} + +//===----------------------------------------------------------------------===// +// sopp branch targets +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + switch (getLexer().getKind()) { + default: return MatchOperand_ParseFail; + case AsmToken::Integer: { + int64_t Imm; + if (getParser().parseAbsoluteExpression(Imm)) + return MatchOperand_ParseFail; + Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); + return MatchOperand_Success; + } + + case AsmToken::Identifier: + Operands.push_back(AMDGPUOperand::CreateExpr( + MCSymbolRefExpr::create(getContext().getOrCreateSymbol( + Parser.getTok().getString()), getContext()), S)); + Parser.Lex(); + return MatchOperand_Success; + } +} + +//===----------------------------------------------------------------------===// +// flat +//===----------------------------------------------------------------------===// + +static const OptionalOperand FlatOptionalOps [] = { + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +static const OptionalOperand FlatAtomicOptionalOps [] = { + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { + return parseOptionalOps(FlatAtomicOptionalOps, Operands); +} + +void AMDGPUAsmParser::cvtFlat(MCInst &Inst, + const OperandVector &Operands) { + std::map OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle 'glc' token which is sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) + continue; + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + + } + + // flat atomic instructions don't have a glc argument. 
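+  // (Illustrative note: for non-atomic flat ops the optional "glc" parsed
+  // above is recorded in OptionalIdx; for atomics it is either absent or a
+  // hard-coded token skipped in the loop, so the count() check below covers
+  // both cases.)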
+ if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + } + + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mubuf +//===----------------------------------------------------------------------===// + +static const OptionalOperand MubufOptionalOps [] = { + {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, + {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} +}; + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { + return parseOptionalOps(MubufOptionalOps, Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseOffset(OperandVector &Operands) { + return parseIntWithPrefix("offset", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseGLC(OperandVector &Operands) { + return parseNamedBit("glc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSLC(OperandVector &Operands) { + return parseNamedBit("slc", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseTFE(OperandVector &Operands) { + return parseNamedBit("tfe", Operands); +} + +bool AMDGPUOperand::isMubufOffset() const { + return isImm() && isUInt<12>(getImm()); +} + +void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, + const OperandVector &Operands) { + std::map OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. 
+ if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + assert(OptionalIdx.size() == 4); + + unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; + unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; + unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; + unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + + ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +} + +//===----------------------------------------------------------------------===// +// mimg +//===----------------------------------------------------------------------===// + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseDMask(OperandVector &Operands) { + return parseIntWithPrefix("dmask", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { + return parseNamedBit("unorm", Operands); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseR128(OperandVector &Operands) { + return parseNamedBit("r128", Operands); +} + +//===----------------------------------------------------------------------===// +// vop3 +//===----------------------------------------------------------------------===// + +static bool ConvertOmodMul(int64_t &Mul) { + if (Mul != 1 && Mul != 2 && Mul != 4) + return false; + + Mul >>= 1; + return true; +} + +static bool ConvertOmodDiv(int64_t &Div) { + if (Div == 1) { + Div = 0; + return true; + } + + if (Div == 2) { + Div = 3; + return true; + } + + return false; +} + +static const OptionalOperand VOP3OptionalOps [] = { + {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, + {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, + {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, +}; + +static bool isVOP3(OperandVector &Operands) { + if (operandsHaveModifiers(Operands)) + return true; + + AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); + + if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) + return true; + + if (Operands.size() >= 5) + return true; + + if (Operands.size() > 3) { + AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); + if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || + Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) + return true; + } + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { + + // The value returned by this function may change after parsing + // an operand so store the original value here. 
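+  // (In particular, parseOptionalOps below may append a clamp or omod
+  // immediate, which would make operandsHaveModifiers() return true.)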
+ bool HasModifiers = operandsHaveModifiers(Operands); + + bool IsVOP3 = isVOP3(Operands); + if (HasModifiers || IsVOP3 || + getLexer().isNot(AsmToken::EndOfStatement) || + getForcedEncodingSize() == 64) { + + AMDGPUAsmParser::OperandMatchResultTy Res = + parseOptionalOps(VOP3OptionalOps, Operands); + + if (!HasModifiers && Res == MatchOperand_Success) { + // We have added a modifier operation, so we need to make sure all + // previous register operands have modifiers + for (unsigned i = 2, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); + if (Op.isReg()) + Op.setModifiers(0); + } + } + return Res; + } + return MatchOperand_NoMatch; +} + +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + unsigned i = 2; + + std::map OptionalIdx; + + if (operandsHaveModifiers(Operands)) { + for (unsigned e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + if (Op.isRegWithInputMods()) { + ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); + continue; + } + OptionalIdx[Op.getImmTy()] = i; + } + + unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; + unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; + + ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); + ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); + } else { + for (unsigned e = Operands.size(); i != e; ++i) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); + } +} + +/// Force static initialization. +extern "C" void LLVMInitializeAMDGPUAsmParser() { + RegisterMCAsmParser A(TheAMDGPUTarget); + RegisterMCAsmParser B(TheGCNTarget); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "AMDGPUGenAsmMatcher.inc" + diff --git a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt new file mode 100644 index 00000000000..21ddc4eb83d --- /dev/null +++ b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAMDGPUAsmParser + AMDGPUAsmParser.cpp + ) diff --git a/lib/Target/AMDGPU/AsmParser/LLVMBuild.txt b/lib/Target/AMDGPU/AsmParser/LLVMBuild.txt new file mode 100644 index 00000000000..63d44d1e06f --- /dev/null +++ b/lib/Target/AMDGPU/AsmParser/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUAsmParser +parent = AMDGPU +required_libraries = MC MCParser AMDGPUDesc AMDGPUInfo Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/AsmParser/Makefile b/lib/Target/AMDGPU/AsmParser/Makefile new file mode 100644 index 00000000000..e6689b54b6b --- /dev/null +++ b/lib/Target/AMDGPU/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600AsmParser + +# Hack: we need to include 'main' R600 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td new file mode 100644 index 00000000000..2f5fdbe9207 --- /dev/null +++ b/lib/Target/AMDGPU/CIInstructions.td @@ -0,0 +1,149 @@ +//===-- CIInstructions.td - CI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for CI and newer. +//===----------------------------------------------------------------------===// + + +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isCIVI in { + +defm V_TRUNC_F64 : VOP1Inst , "v_trunc_f64", + VOP_F64_F64, ftrunc +>; +defm V_CEIL_F64 : VOP1Inst , "v_ceil_f64", + VOP_F64_F64, fceil +>; +defm V_FLOOR_F64 : VOP1Inst , "v_floor_f64", + VOP_F64_F64, ffloor +>; +defm V_RNDNE_F64 : VOP1Inst , "v_rndne_f64", + VOP_F64_F64, frint +>; +defm V_LOG_LEGACY_F32 : VOP1Inst , "v_log_legacy_f32", + VOP_F32_F32 +>; +defm V_EXP_LEGACY_F32 : VOP1Inst , "v_exp_legacy_f32", + VOP_F32_F32 +>; + +//===----------------------------------------------------------------------===// +// Flat Instructions +//===----------------------------------------------------------------------===// + +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; +def 
FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; +def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; +def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + 0x1d, "flat_store_dwordx2", VReg_64 +>; +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + 0x1e, "flat_store_dwordx4", VReg_128 +>; +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x1f, "flat_store_dwordx3", VReg_96 +>; +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; +defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < + 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; +defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; +defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; +defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < + 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; +defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; + +} // End SubtargetPredicate = isCIVI + +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [HasFlatAddressSpace] in { + +class FLATLoad_Pattern : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr, 0, 0, 0) +>; + +def : 
FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; + +class FLATStore_Pattern : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr, 0, 0, 0) + >; + +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; + +} // End HasFlatAddressSpace predicate + diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt new file mode 100644 index 00000000000..3e5ff1f3c6d --- /dev/null +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -0,0 +1,64 @@ +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) + +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) +add_public_tablegen_target(AMDGPUCommonTableGen) + +add_llvm_target(AMDGPUCodeGen + AMDILCFGStructurizer.cpp + AMDGPUAlwaysInlinePass.cpp + AMDGPUAsmPrinter.cpp + AMDGPUFrameLowering.cpp + AMDGPUIntrinsicInfo.cpp + AMDGPUISelDAGToDAG.cpp + AMDGPUMCInstLower.cpp + AMDGPUMachineFunction.cpp + AMDGPUSubtarget.cpp + AMDGPUTargetMachine.cpp + AMDGPUTargetTransformInfo.cpp + AMDGPUISelLowering.cpp + AMDGPUInstrInfo.cpp + AMDGPUPromoteAlloca.cpp + AMDGPURegisterInfo.cpp + R600ClauseMergePass.cpp + R600ControlFlowFinalizer.cpp + R600EmitClauseMarkers.cpp + R600ExpandSpecialInstrs.cpp + R600InstrInfo.cpp + R600ISelLowering.cpp + R600MachineFunctionInfo.cpp + R600MachineScheduler.cpp + R600OptimizeVectorRegisters.cpp + R600Packetizer.cpp + R600RegisterInfo.cpp + R600TextureIntrinsicsReplacer.cpp + SIAnnotateControlFlow.cpp + SIFixControlFlowLiveIntervals.cpp + SIFixSGPRCopies.cpp + SIFixSGPRLiveRanges.cpp + SIFoldOperands.cpp + SIInsertWaits.cpp + SIInstrInfo.cpp + SIISelLowering.cpp + SILoadStoreOptimizer.cpp + SILowerControlFlow.cpp + SILowerI1Copies.cpp + SIMachineFunctionInfo.cpp + SIPrepareScratchRegs.cpp + SIRegisterInfo.cpp + SIShrinkInstructions.cpp + SITypeRewriter.cpp + ) + +add_subdirectory(AsmParser) +add_subdirectory(InstPrinter) +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td new file mode 100644 index 00000000000..ba4df82a6d3 --- /dev/null +++ b/lib/Target/AMDGPU/CaymanInstructions.td @@ -0,0 +1,226 @@ +//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available only on Cayman +// family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isCayman : Predicate<"Subtarget->hasCaymanISA()">; + +//===----------------------------------------------------------------------===// +// Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isCayman] in { + +def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", + [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU +>; +def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU +>; + +def : IMad24Pat; + +let isVector = 1 in { + +def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; + +def MULLO_INT_cm : MULLO_INT_Common<0x8F>; +def MULHI_INT_cm : MULHI_INT_Common<0x90>; +def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; +def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; +def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; +def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; +def SIN_cm : SIN_Common<0x8D>; +def COS_cm : COS_Common<0x8E>; +} // End isVector = 1 + +def : RsqPat; + +def : POW_Common ; + +defm DIV_cm : DIV_Common; +defm : Expand24UBitOps; + +// RECIP_UINT emulation for Cayman +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), + (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) +>; + + def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { + let ADDR = 0; + let POP_COUNT = 0; + let COUNT = 0; + } + + +def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; + +class RAT_STORE_DWORD mask> : + CF_MEM_RAT_CACHELESS <0x14, 0, mask, + (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), + "STORE_DWORD $rw_gpr, $index_gpr", + [(global_store vt:$rw_gpr, i32:$index_gpr)]> { + let eop = 0; // This bit is not used on Cayman. +} + +def RAT_STORE_DWORD32 : RAT_STORE_DWORD ; +def RAT_STORE_DWORD64 : RAT_STORE_DWORD ; +def RAT_STORE_DWORD128 : RAT_STORE_DWORD ; + +class VTX_READ_cm buffer_id, dag outs, list pattern> + : VTX_WORD0_cm, VTX_READ { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. 
+ let SRC_SEL_X = 0; + let SRC_SEL_Y = 0; + let STRUCTURED_READ = 0; + let LDS_REQ = 0; + let COALESCED_READ = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 + // %T2_X = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_cm buffer_id, list pattern> + : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// +def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End isCayman + diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td new file mode 100644 index 00000000000..7adcd46fe19 --- /dev/null +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -0,0 +1,670 @@ +//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to Evergreen and newer VLIW4/VLIW5 GPUs +// - Available only on Evergreen family GPUs. 
+// +//===----------------------------------------------------------------------===// + +def isEG : Predicate< + "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " + "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "!Subtarget->hasCaymanISA()" +>; + +def isEGorCayman : Predicate< + "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" + "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" +>; + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman store instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +class CF_MEM_RAT_CACHELESS rat_inst, bits<4> rat_id, bits<4> mask, dag ins, + string name, list pattern> + : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, + "MEM_RAT_CACHELESS "#name, pattern>; + +class CF_MEM_RAT rat_inst, bits<4> rat_id, dag ins, string name, + list pattern> + : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, + "MEM_RAT "#name, pattern>; + +def RAT_MSKOR : CF_MEM_RAT <0x11, 0, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), + "MSKOR $rw_gpr.XW, $index_gpr", + [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] +> { + let eop = 0; +} + +} // End let Predicates = [isEGorCayman] + +//===----------------------------------------------------------------------===// +// Evergreen Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEG] in { + +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; +defm DIV_eg : DIV_Common; + +def MULLO_INT_eg : MULLO_INT_Common<0x8F>; +def MULHI_INT_eg : MULHI_INT_Common<0x90>; +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +def : RsqPat; +def SIN_eg : SIN_Common<0x8D>; +def COS_eg : COS_Common<0x8E>; + +def : POW_Common ; +def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; + +defm : Expand24IBitOps; + +//===----------------------------------------------------------------------===// +// Memory read/write instructions +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1 in { + +// 32-bit store +def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr, $index_gpr, $eop", + [(global_store i32:$rw_gpr, i32:$index_gpr)] +>; + +// 64-bit store +def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, + (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", + [(global_store v2i32:$rw_gpr, i32:$index_gpr)] +>; + +//128-bit store +def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", + [(global_store v4i32:$rw_gpr, i32:$index_gpr)] +>; + +} // End usesCustomInserter = 1 + +class VTX_READ_eg buffer_id, dag outs, list pattern> + : VTX_WORD0_eg, VTX_READ { + + // Static fields + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let 
SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. + let SRC_SEL_X = 0; + + let Inst{31-0} = Word0; +} + +class VTX_READ_8_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 1; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + let MEGA_FETCH_COUNT = 2; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, + (outs R600_TReg32_X:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 4; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $src_gpr registers of the VTX_READ. + // e.g. + // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 + // %T2_X = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$src_gpr.ptr = $dst_gpr"; +} + +class VTX_READ_64_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, + (outs R600_Reg64:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 8; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 7; + let DST_SEL_W = 7; + let DATA_FORMAT = 0x1D; // COLOR_32_32 +} + +class VTX_READ_128_eg buffer_id, list pattern> + : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, + (outs R600_Reg128:$dst_gpr), pattern> { + + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst + // registers are different sizes. 
+} + +//===----------------------------------------------------------------------===// +// VTX Read from parameter memory space +//===----------------------------------------------------------------------===// + +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, + [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, + [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, + [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, + [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, + [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] +>; + +//===----------------------------------------------------------------------===// +// VTX Read from global memory space +//===----------------------------------------------------------------------===// + +// 8-bit reads +def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, + [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +>; + +def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, + [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, + [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, + [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, + [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +>; + +} // End Predicates = [isEG] + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + +// Should be predicated on FeatureFP64 +// def FMA_64 : R600_3OP < +// 0xA, "FMA_64", +// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] +// >; + +// BFE_UINT - bit_extract, an optimization for mask and shift +// Src0 = Input +// Src1 = Offset +// Src2 = Width +// +// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) +// +// Example Usage: +// (Offset, Width) +// +// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 +// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 +// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 +// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 +def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", + [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", + [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : BFEPattern ; + +def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", + [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], + VecALU +>; + +def : Pat<(i32 (sext_inreg i32:$src, i1)), + (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; +def : Pat<(i32 (sext_inreg i32:$src, i8)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; +def : Pat<(i32 (sext_inreg i32:$src, i16)), + (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; + +defm : BFIPatterns ; + +def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], + VecALU +>; + +def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", + [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU +>; + +def : UMad24Pat; + +def 
BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; +def : ROTRPattern ; +def MULADD_eg : MULADD_Common<0x14>; +def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; +def FMA_eg : FMA_Common<0x7>; +def ASHR_eg : ASHR_Common<0x15>; +def LSHR_eg : LSHR_Common<0x16>; +def LSHL_eg : LSHL_Common<0x17>; +def CNDE_eg : CNDE_Common<0x19>; +def CNDGT_eg : CNDGT_Common<0x1A>; +def CNDGE_eg : CNDGE_Common<0x1B>; +def MUL_LIT_eg : MUL_LIT_Common<0x1F>; +def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; +def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU +>; +def DOT4_eg : DOT4_Common<0xBE>; +defm CUBE_eg : CUBE_Common<0xC0>; + +def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; + +def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; +def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; + +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; + +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; +} + +def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; + +def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { + let Pattern = []; + let Itinerary = AnyALU; +} + +def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + +def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { + let Pattern = []; +} + +def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; + +def GROUP_BARRIER : InstR600 < + (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + R600ALU_Word0, + R600ALU_Word1_OP2 <0x54> { + + let dst = 0; + let dst_rel = 0; + let src0 = 0; + let src0_rel = 0; + let src0_neg = 0; + let src0_abs = 0; + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let write = 0; + let omod = 0; + let clamp = 0; + let last = 1; + let bank_swizzle = 0; + let pred_sel = 0; + let update_exec_mask = 0; + let update_pred = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; +} + +def : Pat < + (int_AMDGPU_barrier_global), + (GROUP_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// LDS Instructions +//===----------------------------------------------------------------------===// +class R600_LDS op, dag outs, dag ins, string asm, + list pattern = []> : + + InstR600 , + R600_ALU_LDS_Word0, + R600LDS_Word1 { + + bits<6> offset = 0; + let lds_op = op; + + let Word1{27} = offset{0}; + let Word1{12} = offset{1}; + let Word1{28} = offset{2}; + let Word1{31} = offset{3}; + let Word0{12} = offset{4}; + let Word0{25} = offset{5}; + + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let ALUInst = 1; + let HasNativeOperands = 1; + let UseNamedOperandTable = 1; +} + +class R600_LDS_1A lds_op, string name, list pattern> : R600_LDS < + lds_op, + (outs R600_Reg32:$dst), + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), + " "#name#" $last OQAP, $src0$src0_rel $pred_sel", + pattern + > { + + let src1 = 0; + let src1_rel = 0; + let src2 = 0; + let src2_rel = 0; + + let usesCustomInserter = 1; + let LDS_1A = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A1D lds_op, dag outs, string name, list pattern, + string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, + BANK_SWIZZLE:$bank_swizzle), 
+ " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", + pattern + > { + + field string BaseOp; + + let src2 = 0; + let src2_rel = 0; + let LDS_1A1D = 1; +} + +class R600_LDS_1A1D_NORET lds_op, string name, list pattern> : + R600_LDS_1A1D { + let BaseOp = name; +} + +class R600_LDS_1A1D_RET lds_op, string name, list pattern> : + R600_LDS_1A1D { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +class R600_LDS_1A2D lds_op, dag outs, string name, list pattern, + string dst =""> : + R600_LDS < + lds_op, outs, + (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), + " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", + pattern> { + + field string BaseOp; + + let LDS_1A1D = 0; + let LDS_1A2D = 1; +} + +class R600_LDS_1A2D_NORET lds_op, string name, list pattern> : + R600_LDS_1A2D { + let BaseOp = name; +} + +class R600_LDS_1A2D_RET lds_op, string name, list pattern> : + R600_LDS_1A2D { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; +} + +def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; +def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; +def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; +def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; +def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; +def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; +def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; +def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; +def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; +def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; +def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; +def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", + [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] +>; +def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", + [(truncstorei8_local i32:$src1, i32:$src0)] +>; +def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", + [(truncstorei16_local i32:$src1, i32:$src0)] +>; +def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", + [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] +>; +def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", + [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] +>; +def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", + [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] +>; +def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", + [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] +>; +def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", + [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", + [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", + [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] +>; +def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", + [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] +>; +def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", + [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] +>; +def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", + [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] +>; +def 
LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", + [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] +>; +def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", + [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] +>; +def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", + [(set i32:$dst, (sextloadi8_local i32:$src0))] +>; +def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", + [(set i32:$dst, (az_extloadi8_local i32:$src0))] +>; +def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", + [(set i32:$dst, (sextloadi16_local i32:$src0))] +>; +def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", + [(set i32:$dst, (az_extloadi16_local i32:$src0))] +>; + +// TRUNC is used for the FLT_TO_INT instructions to work around a +// perceived problem where the rounding modes are applied differently +// depending on the instruction and the slot they are in. +// See: +// https://bugs.freedesktop.org/show_bug.cgi?id=50232 +// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c +// +// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, +// which do not need to be truncated since the fp values are 0.0f or 1.0f. +// We should look into handling these cases separately. +def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; + +def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; + +// SHA-256 Patterns +def : SHA256MaPattern ; + +def EG_ExportSwz : ExportSwzInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : ExportPattern; + +def EG_ExportBuf : ExportBufInst { + let Word1{19-16} = 0; // BURST_COUNT + let Word1{20} = 0; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER +} +defm : SteamOutputExportPattern; + +def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; +} +def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; +} +def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "PUSH @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; +} +def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; +} +def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; +} + +} // End Predicates = [isEGorCayman] diff --git 
a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 index 00000000000..e811d5cff22 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -0,0 +1,642 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { + OS.flush(); + printInstruction(MI, OS); + + printAnnotation(OS, Annot); +} + +void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); +} + +void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); +} + +void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); +} + +void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " offen"; +} + +void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " idxen"; +} + +void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " addr64"; +} + +void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + uint16_t Imm = MI->getOperand(OpNo).getImm(); + if (Imm != 0) { + O << " offset:"; + printU16ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset0:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " offset1:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " gds"; +} + +void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " glc"; +} + +void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if 
(MI->getOperand(OpNo).getImm()) + O << " slc"; +} + +void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " tfe"; +} + +void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, + const MCRegisterInfo &MRI) { + switch (reg) { + case AMDGPU::VCC: + O << "vcc"; + return; + case AMDGPU::SCC: + O << "scc"; + return; + case AMDGPU::EXEC: + O << "exec"; + return; + case AMDGPU::M0: + O << "m0"; + return; + case AMDGPU::FLAT_SCR: + O << "flat_scratch"; + return; + case AMDGPU::VCC_LO: + O << "vcc_lo"; + return; + case AMDGPU::VCC_HI: + O << "vcc_hi"; + return; + case AMDGPU::EXEC_LO: + O << "exec_lo"; + return; + case AMDGPU::EXEC_HI: + O << "exec_hi"; + return; + case AMDGPU::FLAT_SCR_LO: + O << "flat_scratch_lo"; + return; + case AMDGPU::FLAT_SCR_HI: + O << "flat_scratch_hi"; + return; + default: + break; + } + + char Type; + unsigned NumRegs; + + if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 1; + } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 2; + } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 4; + } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 3; + } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 8; + } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { + Type = 'v'; + NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { + Type = 's'; + NumRegs = 16; + } else { + O << getRegisterName(reg); + return; + } + + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
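+  // For instance, a VGPR pair from VReg_64 whose encoding value has low
+  // 8 bits equal to 6 prints as "v[6:7]", while an SGPR_32 register with
+  // index 3 prints as "s3" (the indices here are sample values, not fixed
+  // encodings).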
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + if (NumRegs == 1) { + O << Type << RegIdx; + return; + } + + O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; +} + +void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) + O << "_e64 "; + else + O << "_e32 "; + + printOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { + int32_t SImm = static_cast(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == FloatToBits(0.0f)) + O << "0.0"; + else if (Imm == FloatToBits(1.0f)) + O << "1.0"; + else if (Imm == FloatToBits(-1.0f)) + O << "-1.0"; + else if (Imm == FloatToBits(0.5f)) + O << "0.5"; + else if (Imm == FloatToBits(-0.5f)) + O << "-0.5"; + else if (Imm == FloatToBits(2.0f)) + O << "2.0"; + else if (Imm == FloatToBits(-2.0f)) + O << "-2.0"; + else if (Imm == FloatToBits(4.0f)) + O << "4.0"; + else if (Imm == FloatToBits(-4.0f)) + O << "-4.0"; + else + O << formatHex(static_cast(Imm)); +} + +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { + int64_t SImm = static_cast(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == DoubleToBits(0.0)) + O << "0.0"; + else if (Imm == DoubleToBits(1.0)) + O << "1.0"; + else if (Imm == DoubleToBits(-1.0)) + O << "-1.0"; + else if (Imm == DoubleToBits(0.5)) + O << "0.5"; + else if (Imm == DoubleToBits(-0.5)) + O << "-0.5"; + else if (Imm == DoubleToBits(2.0)) + O << "2.0"; + else if (Imm == DoubleToBits(-2.0)) + O << "-2.0"; + else if (Imm == DoubleToBits(4.0)) + O << "4.0"; + else if (Imm == DoubleToBits(-4.0)) + O << "-4.0"; + else + llvm_unreachable("64-bit literal constants not supported"); +} + +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. + case AMDGPU::PRED_SEL_OFF: + break; + + default: + printRegOperand(Op.getReg(), O, MRI); + break; + } + } else if (Op.isImm()) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int RCID = Desc.OpInfo[OpNo].RegClass; + if (RCID != -1) { + const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); + if (ImmRC.getSize() == 4) + printImmediate32(Op.getImm(), O); + else if (ImmRC.getSize() == 8) + printImmediate64(Op.getImm(), O); + else + llvm_unreachable("Invalid register class size"); + } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { + printImmediate32(Op.getImm(), O); + } else { + // We hit this for the immediate instruction bits that don't yet have a + // custom printer. + // TODO: Eventually this should be unnecessary. + O << formatDec(Op.getImm()); + } + } else if (Op.isFPImm()) { + // We special case 0.0 because otherwise it will be printed as an integer. 
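+    // FloatToBits(0.0) is 0, which printImmediate32 would print as the
+    // integer "0". Any other inline value, e.g. 1.0, still goes through
+    // printImmediate32/printImmediate64 and is printed as "1.0".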
+ if (Op.getFPImm() == 0.0) + O << "0.0"; + else { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); + + if (ImmRC.getSize() == 4) + printImmediate32(FloatToBits(Op.getFPImm()), O); + else if (ImmRC.getSize() == 8) + printImmediate64(DoubleToBits(Op.getFPImm()), O); + else + llvm_unreachable("Invalid register class size"); + } + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O, &MAI); + } else { + llvm_unreachable("unknown operand type in printOperand"); + } +} + +void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & SISrcMods::NEG) + O << '-'; + if (InputModifiers & SISrcMods::ABS) + O << '|'; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & SISrcMods::ABS) + O << '|'; +} + +void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + + if (Imm == 2) { + O << "P0"; + } else if (Imm == 1) { + O << "P20"; + } else if (Imm == 0) { + O << "P10"; + } else { + llvm_unreachable("Invalid interpolation parameter slot"); + } +} + +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef Asm, + StringRef Default) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) { + O << Asm; + } else { + O << Default; + } +} + +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "|"); +} + +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "_SAT"); +} + +void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " clamp"; +} + +void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Imm = MI->getOperand(OpNo).getImm(); + if (Imm == SIOutMods::MUL2) + O << " mul:2"; + else if (Imm == SIOutMods::MUL4) + O << " mul:4"; + else if (Imm == SIOutMods::DIV2) + O << " div:2"; +} + +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int32_t Imm = MI->getOperand(OpNo).getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; +} + +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "*", " "); +} + +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "-"); +} + +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + break; + case 3: + O << " / 2.0"; + break; + } +} + +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "+"); +} + +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "Pred,"); +} + +void AMDGPUInstPrinter::printWrite(const MCInst 
*MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} + +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << '[' << sel << ']'; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << '.' << chans[chan]; +} + +void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int BankSwizzle = MI->getOperand(OpNo).getImm(); + switch (BankSwizzle) { + case 1: + O << "BS:VEC_021/SCL_122"; + break; + case 2: + O << "BS:VEC_120/SCL_212"; + break; + case 3: + O << "BS:VEC_102/SCL_221"; + break; + case 4: + O << "BS:VEC_201"; + break; + case 5: + O << "BS:VEC_210"; + break; + default: + break; + } + return; +} + +void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Sel = MI->getOperand(OpNo).getImm(); + switch (Sel) { + case 0: + O << 'X'; + break; + case 1: + O << 'Y'; + break; + case 2: + O << 'Z'; + break; + case 3: + O << 'W'; + break; + case 4: + O << '0'; + break; + case 5: + O << '1'; + break; + case 7: + O << '_'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CT = MI->getOperand(OpNo).getImm(); + switch (CT) { + case 0: + O << 'U'; + break; + case 1: + O << 'N'; + break; + default: + break; + } +} + +void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int KCacheMode = MI->getOperand(OpNo).getImm(); + if (KCacheMode > 0) { + int KCacheBank = MI->getOperand(OpNo - 2).getImm(); + O << "CB" << KCacheBank << ':'; + int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); + int LineSize = (KCacheMode == 1) ? 16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; + } +} + +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Msg = SImm16 & 0xF; + if (Msg == 2 || Msg == 3) { + unsigned Op = (SImm16 >> 4) & 0xF; + if (Msg == 3) + O << "Gs_done("; + else + O << "Gs("; + if (Op == 0) { + O << "nop"; + } else { + unsigned Stream = (SImm16 >> 8) & 0x3; + if (Op == 1) + O << "cut"; + else if (Op == 2) + O << "emit"; + else if (Op == 3) + O << "emit-cut"; + O << " stream " << Stream; + } + O << "), [m0] "; + } else if (Msg == 1) + O << "interrupt "; + else if (Msg == 15) + O << "system "; + else + O << "unknown(" << Msg << ") "; +} + +void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs + // SIInsertWaits.cpp bits usage does not match ISA docs description but it + // works so it might be a misprint in docs. 
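+  // Worked example: an operand of 0x770 decodes below to Vmcnt = 0,
+  // Expcnt = 7 and Lgkmcnt = 7, so only "vmcnt(0)" is printed (0x770 is a
+  // sample value, not a constant taken from the ISA).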
+ unsigned SImm16 = MI->getOperand(OpNo).getImm(); + unsigned Vmcnt = SImm16 & 0xF; + unsigned Expcnt = (SImm16 >> 4) & 0xF; + unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; + + bool NeedSpace = false; + + if (Vmcnt != 0xF) { + O << "vmcnt(" << Vmcnt << ')'; + NeedSpace = true; + } + + if (Expcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "expcnt(" << Expcnt << ')'; + NeedSpace = true; + } + + if (Lgkmcnt != 0x7) { + if (NeedSpace) + O << ' '; + O << "lgkmcnt(" << Lgkmcnt << ')'; + } +} + +#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 index 00000000000..14fb511e923 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -0,0 +1,88 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class AMDGPUInstPrinter : public MCInstPrinter { +public: + AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + //Autogenerated by tblgen + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + static void printRegOperand(unsigned RegNo, raw_ostream &O, + const MCRegisterInfo &MRI); + +private: + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); + void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printImmediate32(uint32_t I, raw_ostream &O); + void printImmediate64(uint64_t I, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void 
printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, + StringRef Asm, StringRef Default = ""); + static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O); + static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt new file mode 100644 index 00000000000..ce63bd553b9 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAMDGPUAsmPrinter + AMDGPUInstPrinter.cpp + ) diff --git a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt new file mode 100644 index 00000000000..fdb43844dc6 --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUAsmPrinter +parent = AMDGPU +required_libraries = MC Support +add_to_library_groups = AMDGPU + diff --git a/lib/Target/AMDGPU/InstPrinter/Makefile b/lib/Target/AMDGPU/InstPrinter/Makefile new file mode 100644 index 00000000000..a794cc1124e --- /dev/null +++ b/lib/Target/AMDGPU/InstPrinter/Makefile @@ -0,0 +1,15 @@ +#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600AsmPrinter + +# Hack: we need to include 'main' x86 target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt new file mode 100644 index 00000000000..c6861df91ed --- /dev/null +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -0,0 +1,33 @@ +;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo + +[component_0] +type = TargetGroup +name = AMDGPU +parent = Target +has_asmparser = 1 +has_asmprinter = 1 + +[component_1] +type = Library +name = AMDGPUCodeGen +parent = AMDGPU +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmParser AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo Scalar SelectionDAG Support Target TransformUtils +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp new file mode 100644 index 00000000000..8bed2deef4c --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -0,0 +1,144 @@ +//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +namespace { + +class AMDGPUMCObjectWriter : public MCObjectWriter { +public: + AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} + void executePostLayoutBinding(MCAssembler &Asm, + const MCAsmLayout &Layout) override { + //XXX: Implement if necessary. 
+ } + void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, bool &IsPCRel, + uint64_t &FixedValue) override { + assert(!"Not implemented"); + } + + void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; + +}; + +class AMDGPUAsmBackend : public MCAsmBackend { +public: + AMDGPUAsmBackend(const Target &T) + : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + assert(!"Not implemented"); + } + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; +}; + +} //End anonymous namespace + +void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { + Asm.writeSectionData(&*I, Layout); + } +} + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("Unknown fixup kind"); + case AMDGPU::fixup_si_sopp_br: { + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + *Dst = (Value - 4) / 4; + break; + } + + case AMDGPU::fixup_si_rodata: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + *Dst = Value; + break; + } + + case AMDGPU::fixup_si_end_of_text: { + uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); + // The value points to the last instruction in the text section, so we + // need to add 4 bytes to get to the start of the constants. 
+ *Dst = Value + 4; + break; + } + } +} + +const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( + MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_si_rodata", 0, 32, 0 }, + { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + return Infos[Kind - FirstTargetFixupKind]; +} + +bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + OW->WriteZeros(Count); + + return true; +} + +//===----------------------------------------------------------------------===// +// ELFAMDGPUAsmBackend class +//===----------------------------------------------------------------------===// + +namespace { + +class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { +public: + ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createAMDGPUELFObjectWriter(OS); + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU) { + return new ELFAMDGPUAsmBackend(T); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp new file mode 100644 index 00000000000..59f45ff02d8 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -0,0 +1,39 @@ +//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" + +using namespace llvm; + +namespace { + +class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { +public: + AMDGPUELFObjectWriter(); +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override { + return Fixup.getKind(); + } + +}; + + +} // End anonymous namespace + +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() + : MCELFObjectTargetWriter(false, 0, 0, false) { } + +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { + MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); + return createELFObjectWriter(MOTW, OS, true); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h new file mode 100644 index 00000000000..01021d67ffd --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -0,0 +1,34 @@ +//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AMDGPU { +enum Fixups { + /// 16-bit PC relative fixup for SOPP branch instructions. 
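+  /// The backend resolves this fixup in AMDGPUAsmBackend::applyFixup by
+  /// storing (Value - 4) / 4 into the 16-bit field.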
+ fixup_si_sopp_br = FirstTargetFixupKind, + + /// fixup for global addresses with constant initializers + fixup_si_rodata, + + /// fixup for offset from instruction to end of text section + fixup_si_end_of_text, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp new file mode 100644 index 00000000000..028a86dfc7a --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -0,0 +1,43 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCAsmInfo.h" + +using namespace llvm; +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { + HasSingleParameterDotFile = false; + //===------------------------------------------------------------------===// + MaxInstLength = 16; + SeparatorString = "\n"; + CommentString = ";"; + PrivateLabelPrefix = ""; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + + //===--- Data Emission Directives -------------------------------------===// + ZeroDirective = ".zero"; + AsciiDirective = ".ascii\t"; + AscizDirective = ".asciz\t"; + Data8bitsDirective = ".byte\t"; + Data16bitsDirective = ".short\t"; + Data32bitsDirective = ".long\t"; + Data64bitsDirective = ".quad\t"; + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = true; + + //===--- Global Variable Emission Directives --------------------------===// + HasAggressiveSymbolFolding = true; + COMMDirectiveAlignmentIsInBytes = false; + HasDotTypeDotSizeDirective = false; + HasNoDeadStrip = true; + WeakRefDirective = ".weakref\t"; + //===--- Dwarf Emission Directives -----------------------------------===// + SupportsDebugInformation = true; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h new file mode 100644 index 00000000000..a5bac51e356 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -0,0 +1,32 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H + +#include "llvm/MC/MCAsmInfoELF.h" +namespace llvm { + +class Triple; + +// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, +// you will need to make sure your new class sets PrivateGlobalPrefix to +// a prefix that won't appeary in a fuction name. The default value +// for PrivateGlobalPrefix is 'L', so it will consider any function starting +// with 'L' as a local symbol. 
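+// For example, with the default prefix a function named "LinearFunc" (a
+// made-up name) would be treated as a local symbol.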
+class AMDGPUMCAsmInfo : public MCAsmInfoELF { +public: + explicit AMDGPUMCAsmInfo(const Triple &TT); +}; +} // namespace llvm +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp new file mode 100644 index 00000000000..521b3b39bba --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -0,0 +1,21 @@ +//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCCodeEmitter.h" + +using namespace llvm; + +// pin vtable to this file +void AMDGPUMCCodeEmitter::anchor() {} + diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 index 00000000000..c9574276223 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -0,0 +1,50 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H + +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCInst; +class MCOperand; +class MCSubtargetInfo; + +class AMDGPUMCCodeEmitter : public MCCodeEmitter { + virtual void anchor(); +public: + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp new file mode 100644 index 00000000000..a7d3dd1345f --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -0,0 +1,90 @@ +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides AMDGPU specific target descriptions. 
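
The out-of-line AMDGPUMCCodeEmitter::anchor() definition above exists only to give the class's vtable a single home translation unit. A generic, self-contained illustration of the idiom follows; the Emitter type and file names are invented for the sketch.

    // "Anchor" (key-function) idiom: one virtual member is declared in the
    // header but defined in exactly one .cpp file, so compilers can emit the
    // vtable and type info in that one translation unit instead of in every
    // file that includes the header.

    // Emitter.h
    struct Emitter {
      virtual ~Emitter() = default;
      virtual void anchor();                 // declared, never defined inline
      virtual unsigned encode() const { return 0; }
    };

    // Emitter.cpp
    void Emitter::anchor() {}                // vtable is pinned to this file

    int main() { Emitter E; return E.encode(); }
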
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "AMDGPUMCAsmInfo.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDGPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDGPUGenRegisterInfo.inc" + +static MCInstrInfo *createAMDGPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDGPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo * +createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo * X = new MCSubtargetInfo(); + InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->initMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { + return new AMDGPUInstPrinter(MAI, MII, MRI); +} + +extern "C" void LLVMInitializeAMDGPUTargetMC() { + for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { + RegisterMCAsmInfo X(*T); + + TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); + TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); + TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); + TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + } + + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, + createR600MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h new file mode 100644 index 00000000000..92e29dc7037 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -0,0 +1,61 @@ +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Provides AMDGPU specific target descriptions. 
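
LLVMInitializeAMDGPUTargetMC() above registers one factory function per MC component, and the only per-target difference is the code emitter (R600 versus SI). The toy model below shows only that factory-registration shape; the std::map, the function names, and the "r600"/"amdgcn" keys are illustrative stand-ins, not the TargetRegistry API.

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    // Toy stand-ins for the MC objects built by the registered factories.
    struct MCCodeEmitter { virtual ~MCCodeEmitter() = default; };
    struct R600Emitter : MCCodeEmitter {};   // VLIW-style encoding
    struct SIEmitter   : MCCodeEmitter {};   // GCN encoding

    using EmitterFactory = std::function<std::unique_ptr<MCCodeEmitter>()>;

    // Generic code looks targets up by name and never links their libraries.
    std::map<std::string, EmitterFactory> &emitterRegistry() {
      static std::map<std::string, EmitterFactory> R;
      return R;
    }

    void initializeEmitters() {
      emitterRegistry()["r600"]   = [] { return std::make_unique<R600Emitter>(); };
      emitterRegistry()["amdgcn"] = [] { return std::make_unique<SIEmitter>(); };
    }

    int main() {
      initializeEmitters();
      auto E = emitterRegistry()["amdgcn"]();   // builds the SI emitter
      return E ? 0 : 1;
    }
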
+// +//===----------------------------------------------------------------------===// +// + +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class Triple; +class raw_pwrite_stream; +class raw_ostream; + +extern Target TheAMDGPUTarget; +extern Target TheGCNTarget; + +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, + const Triple &TT, StringRef CPU); + +MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); +} // End llvm namespace + +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" + +#endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt new file mode 100644 index 00000000000..151d0d5f83d --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,10 @@ + +add_llvm_library(LLVMAMDGPUDesc + AMDGPUAsmBackend.cpp + AMDGPUELFObjectWriter.cpp + AMDGPUMCCodeEmitter.cpp + AMDGPUMCTargetDesc.cpp + AMDGPUMCAsmInfo.cpp + R600MCCodeEmitter.cpp + SIMCCodeEmitter.cpp + ) diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 00000000000..4217bb36297 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUDesc +parent = AMDGPU +required_libraries = MC AMDGPUAsmPrinter AMDGPUInfo Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/Makefile b/lib/Target/AMDGPU/MCTargetDesc/Makefile new file mode 100644 index 00000000000..8894a7607f4 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
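
AMDGPUMCTargetDesc.h above pulls in only the enum sections of the TableGen-generated files by defining GET_*_ENUM before each include, while AMDGPUMCTargetDesc.cpp selects the *_MC_DESC sections instead. A self-contained sketch of that guard pattern follows; the generated content is inlined and invented here so the snippet compiles on its own.

    // A consumer asks for one section of a generated file by defining its
    // guard macro before the include; unrequested sections expand to nothing.
    #define GET_REGINFO_ENUM

    // ---- roughly what a generated *.inc looks like (inlined for the sketch,
    // ---- with made-up enumerators) ----
    #ifdef GET_REGINFO_ENUM
    namespace AMDGPU {
    enum : unsigned { NoRegister = 0, ExampleReg0, ExampleReg1 };
    } // namespace AMDGPU
    #undef GET_REGINFO_ENUM
    #endif

    #ifdef GET_REGINFO_MC_DESC          // not defined above, so this section
    // ... register descriptor tables   // is skipped by the preprocessor
    #undef GET_REGINFO_MC_DESC
    #endif

    int main() { return AMDGPU::ExampleReg0 == 1 ? 0 : 1; }
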
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 index 00000000000..e683498d52a --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -0,0 +1,181 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// \brief The R600 code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "R600Defines.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { + R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; + void operator=(const R600MCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + +public: + + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) + : MCII(mcii), MRI(mri) { } + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. 
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; +private: + + void EmitByte(unsigned int byte, raw_ostream &OS) const; + + void Emit(uint32_t value, raw_ostream &OS) const; + void Emit(uint64_t value, raw_ostream &OS) const; + + unsigned getHWRegChan(unsigned reg) const; + unsigned getHWReg(unsigned regNo) const; + +}; + +} // End anonymous namespace + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum FCInstr { + FC_IF_PREDICATE = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK_PREDICATE, + FC_CONTINUE +}; + +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new R600MCCodeEmitter(MCII, MRI); +} + +void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::FETCH_CLAUSE || + MI.getOpcode() == AMDGPU::ALU_CLAUSE || + MI.getOpcode() == AMDGPU::BUNDLE || + MI.getOpcode() == AMDGPU::KILL) { + return; + } else if (IS_VTX(Desc)) { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + InstWord2 |= 1 << 19; // Mega-Fetch bit + } + + Emit(InstWord01, OS); + Emit(InstWord2, OS); + Emit((uint32_t) 0, OS); + } else if (IS_TEX(Desc)) { + int64_t Sampler = MI.getOperand(14).getImm(); + + int64_t SrcSelect[4] = { + MI.getOperand(2).getImm(), + MI.getOperand(3).getImm(), + MI.getOperand(4).getImm(), + MI.getOperand(5).getImm() + }; + int64_t Offsets[3] = { + MI.getOperand(6).getImm() & 0x1F, + MI.getOperand(7).getImm() & 0x1F, + MI.getOperand(8).getImm() & 0x1F + }; + + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | + Offsets[2] << 10; + + Emit(Word01, OS); + Emit(Word2, OS); + Emit((uint32_t) 0, OS); + } else { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); + if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + ((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } + Emit(Inst, OS); + } +} + +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { + OS.write((uint8_t) Byte & 0xff); +} + +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { + support::endian::Writer(OS).write(Value); +} + +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { + support::endian::Writer(OS).write(Value); +} + +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { + return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +} + +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl &Fixup, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) + return MRI.getEncodingValue(MO.getReg()); + return getHWReg(MO.getReg()); + } + + assert(MO.isImm()); 
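
getHWReg() and getHWRegChan() above split the TableGen register encoding into a 9-bit register index and a channel number; the constants they rely on, HW_REG_MASK = 0x1ff and HW_CHAN_SHIFT = 9, are defined in R600Defines.h later in this patch. A quick standalone check of that split, with a made-up encoding for illustration:

    #include <cassert>

    constexpr unsigned HW_REG_MASK   = 0x1ff;   // low 9 bits: register index
    constexpr unsigned HW_CHAN_SHIFT = 9;       // bits above: channel (x/y/z/w)

    constexpr unsigned hwReg(unsigned Enc)  { return Enc & HW_REG_MASK; }
    constexpr unsigned hwChan(unsigned Enc) { return Enc >> HW_CHAN_SHIFT; }

    int main() {
      // Hypothetical encoding for register index 5, channel 2 (the .z lane).
      unsigned Enc = (2u << HW_CHAN_SHIFT) | 5u;
      assert(hwReg(Enc) == 5 && hwChan(Enc) == 2);
      return 0;
    }
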
+ return MO.getImm(); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 index 00000000000..65a0eeba2b1 --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -0,0 +1,289 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The SI code emitter produces machine code that can be executed +/// directly on the GPU device. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { + SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; + void operator=(const SIMCCodeEmitter &) = delete; + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + MCContext &Ctx; + + /// \brief Can this operand also contain immediate values? + bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; + + /// \brief Encode an fp or int literal + uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; + +public: + SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + MCContext &ctx) + : MCII(mcii), MRI(mri), Ctx(ctx) { } + + ~SIMCCodeEmitter() override {} + + /// \brief Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \returns the encoding for an MCOperand. + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// instructions. + unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; +}; + +} // End anonymous namespace + +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new SIMCCodeEmitter(MCII, MRI, Ctx); +} + +bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, + unsigned OpNo) const { + unsigned OpType = Desc.OpInfo[OpNo].OperandType; + + return OpType == AMDGPU::OPERAND_REG_IMM32 || + OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. 
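
The template that follows implements the integer half of SI's inline-constant scheme; the float cases and the 255 = "a 32-bit literal dword follows" fallback come right after it in getLit32Encoding() and getLit64Encoding(). For reference, a standalone spot-check of the integer mapping (same rule, rehearsed outside LLVM):

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    // Same rule as getIntInlineImmEncoding() below: 0..64 -> 128..192,
    // -16..-1 -> 193..208, anything else is not an inline integer constant.
    static uint32_t intInlineImm(int32_t Imm) {
      if (Imm >= 0 && Imm <= 64)
        return 128 + Imm;
      if (Imm >= -16 && Imm <= -1)
        return 192 + std::abs(Imm);
      return 0; // fall through to the float checks, and finally to 255
    }

    int main() {
      assert(intInlineImm(0)   == 128);
      assert(intInlineImm(64)  == 192);
      assert(intInlineImm(-1)  == 193);
      assert(intInlineImm(-16) == 208);
      assert(intInlineImm(100) == 0);  // -> encoding 255, literal dword emitted
      return 0;
    }
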
+template +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; + + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); + + return 0; +} + +static uint32_t getLit32Encoding(uint32_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == FloatToBits(0.5f)) + return 240; + + if (Val == FloatToBits(-0.5f)) + return 241; + + if (Val == FloatToBits(1.0f)) + return 242; + + if (Val == FloatToBits(-1.0f)) + return 243; + + if (Val == FloatToBits(2.0f)) + return 244; + + if (Val == FloatToBits(-2.0f)) + return 245; + + if (Val == FloatToBits(4.0f)) + return 246; + + if (Val == FloatToBits(-4.0f)) + return 247; + + return 255; +} + +static uint32_t getLit64Encoding(uint64_t Val) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == DoubleToBits(0.5)) + return 240; + + if (Val == DoubleToBits(-0.5)) + return 241; + + if (Val == DoubleToBits(1.0)) + return 242; + + if (Val == DoubleToBits(-1.0)) + return 243; + + if (Val == DoubleToBits(2.0)) + return 244; + + if (Val == DoubleToBits(-2.0)) + return 245; + + if (Val == DoubleToBits(4.0)) + return 246; + + if (Val == DoubleToBits(-4.0)) + return 247; + + return 255; +} + +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, + unsigned OpSize) const { + if (MO.isExpr()) + return 255; + + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + if (OpSize == 4) + return getLit32Encoding(static_cast(MO.getImm())); + + assert(OpSize == 8); + + return getLit64Encoding(static_cast(MO.getImm())); +} + +void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = Desc.getSize(); + + for (unsigned i = 0; i < bytes; i++) { + OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); + } + + if (bytes > 4) + return; + + // Check for additional literals in SRC0/1/2 (Op 1/2/3) + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!isSrcOperand(Desc, i)) + continue; + + int RCID = Desc.OpInfo[i].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + // Is this operand a literal immediate? + const MCOperand &Op = MI.getOperand(i); + if (getLitEncoding(Op, RC.getSize()) != 255) + continue; + + // Yes! Encode it + int64_t Imm = 0; + + if (Op.isImm()) + Imm = Op.getImm(); + else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. 
+ llvm_unreachable("Must be immediate or expr"); + + for (unsigned j = 0; j < 4; j++) { + OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); + } + + // Only one literal value allowed + break; + } +} + +unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + return 0; + } + + return getMachineOpValue(MI, MO, Fixups, STI); +} + +uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + + if (MO.isExpr()) { + const MCSymbolRefExpr *Expr = cast(MO.getExpr()); + MCFixupKind Kind; + const MCSymbol *Sym = + Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); + + if (&Expr->getSymbol() == Sym) { + // Add the offset to the beginning of the constant values. + Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; + } else { + // This is used for constant data stored in .rodata. + Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; + } + Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); + } + + // Figure out the operand number, needed for isSrcOperand check + unsigned OpNo = 0; + for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { + if (&MO == &MI.getOperand(OpNo)) + break; + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (isSrcOperand(Desc, OpNo)) { + int RCID = Desc.OpInfo[OpNo].RegClass; + const MCRegisterClass &RC = MRI.getRegClass(RCID); + + uint32_t Enc = getLitEncoding(MO, RC.getSize()); + if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + return Enc; + + } else if (MO.isImm()) + return MO.getImm(); + + llvm_unreachable("Encoding of this operand type is not supported yet."); + return 0; +} + diff --git a/lib/Target/AMDGPU/Makefile b/lib/Target/AMDGPU/Makefile new file mode 100644 index 00000000000..64a7c8c045c --- /dev/null +++ b/lib/Target/AMDGPU/Makefile @@ -0,0 +1,23 @@ +##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMR600CodeGen +TARGET = AMDGPU + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ + AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ + AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ + AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ + AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc + +DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td new file mode 100644 index 00000000000..c0ffede5199 --- /dev/null +++ b/lib/Target/AMDGPU/Processors.td @@ -0,0 +1,137 @@ +//===-- Processors.td - R600 Processor definitions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +class Proc Features> +: Processor; + +//===----------------------------------------------------------------------===// +// R600 +//===----------------------------------------------------------------------===// +def : Proc<"", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache]>; + +def : Proc<"r600", R600_VLIW5_Itin, + [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"r630", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rs880", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize16]>; + +def : Proc<"rv670", R600_VLIW5_Itin, + [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// R700 +//===----------------------------------------------------------------------===// + +def : Proc<"rv710", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv730", R600_VLIW5_Itin, + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; + +def : Proc<"rv770", R600_VLIW5_Itin, + [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Evergreen +//===----------------------------------------------------------------------===// + +def : Proc<"cedar", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, + FeatureCFALUBug]>; + +def : Proc<"redwood", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, + FeatureCFALUBug]>; + +def : Proc<"sumo", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; + +def : Proc<"juniper", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"cypress", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureFP64, FeatureVertexCache, + FeatureWavefrontSize64]>; + +//===----------------------------------------------------------------------===// +// Northern Islands +//===----------------------------------------------------------------------===// + +def : Proc<"barts", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"turks", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; + +def : Proc<"caicos", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureCFALUBug]>; + +def : Proc<"cayman", R600_VLIW4_Itin, + [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; + +//===----------------------------------------------------------------------===// +// Southern Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"SI", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32] +>; + +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; + +//===----------------------------------------------------------------------===// +// Sea Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"bonaire", 
SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16] +>; + +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureSeaIslands, FeatureLDSBankCount16]>; + +//===----------------------------------------------------------------------===// +// Volcanic Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"tonga", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureSGPRInitBug] +>; + +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureSGPRInitBug] +>; + +def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp new file mode 100644 index 00000000000..3cb90218a7d --- /dev/null +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -0,0 +1,206 @@ +//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. +/// This pass is merging consecutive CFAlus where applicable. +/// It needs to be called after IfCvt for best results. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "r600mergeclause" + +namespace { + +static bool isCFAlu(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU: + case AMDGPU::CF_ALU_PUSH_BEFORE: + return true; + default: + return false; + } +} + +class R600ClauseMergePass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + unsigned getCFAluSize(const MachineInstr *MI) const; + bool isCFAluEnabled(const MachineInstr *MI) const; + + /// IfCvt pass can generate "disabled" ALU clause marker that need to be + /// removed and their content affected to the previous alu clause. + /// This function parse instructions after CFAlu until it find a disabled + /// CFAlu and merge the content, or an enabled CFAlu. + void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; + + /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if + /// it is the case. 
+ bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) + const; + +public: + R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override; +}; + +char R600ClauseMergePass::ID = 0; + +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); +} + +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { + assert(isCFAlu(MI)); + return MI->getOperand( + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); +} + +void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) + const { + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + I++; + do { + while (I!= E && !isCFAlu(I)) + I++; + if (I == E) + return; + MachineInstr *MI = I++; + if (isCFAluEnabled(MI)) + break; + CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI->eraseFromParent(); + } while (I != E); +} + +bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, + const MachineInstr *LatrCFAlu) const { + assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); + int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + unsigned RootInstCount = getCFAluSize(RootCFAlu), + LaterInstCount = getCFAluSize(LatrCFAlu); + unsigned CumuledInsts = RootInstCount + LaterInstCount; + if (CumuledInsts >= TII->getMaxAlusPerClause()) { + DEBUG(dbgs() << "Excess inst counts\n"); + return false; + } + if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return false; + // Is KCache Bank 0 compatible ? + int Mode0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + int KBank0Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + int KBank0LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + if (LatrCFAlu->getOperand(Mode0Idx).getImm() && + RootCFAlu->getOperand(Mode0Idx).getImm() && + (LatrCFAlu->getOperand(KBank0Idx).getImm() != + RootCFAlu->getOperand(KBank0Idx).getImm() || + LatrCFAlu->getOperand(KBank0LineIdx).getImm() != + RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + // Is KCache Bank 1 compatible ? 
+ int Mode1Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); + int KBank1Idx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); + int KBank1LineIdx = + TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); + if (LatrCFAlu->getOperand(Mode1Idx).getImm() && + RootCFAlu->getOperand(Mode1Idx).getImm() && + (LatrCFAlu->getOperand(KBank1Idx).getImm() != + RootCFAlu->getOperand(KBank1Idx).getImm() || + LatrCFAlu->getOperand(KBank1LineIdx).getImm() != + RootCFAlu->getOperand(KBank1LineIdx).getImm())) { + DEBUG(dbgs() << "Wrong KC0\n"); + return false; + } + if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { + RootCFAlu->getOperand(Mode0Idx).setImm( + LatrCFAlu->getOperand(Mode0Idx).getImm()); + RootCFAlu->getOperand(KBank0Idx).setImm( + LatrCFAlu->getOperand(KBank0Idx).getImm()); + RootCFAlu->getOperand(KBank0LineIdx).setImm( + LatrCFAlu->getOperand(KBank0LineIdx).getImm()); + } + if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { + RootCFAlu->getOperand(Mode1Idx).setImm( + LatrCFAlu->getOperand(Mode1Idx).getImm()); + RootCFAlu->getOperand(KBank1Idx).setImm( + LatrCFAlu->getOperand(KBank1Idx).getImm()); + RootCFAlu->getOperand(KBank1LineIdx).setImm( + LatrCFAlu->getOperand(KBank1LineIdx).getImm()); + } + RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); + RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); + return true; +} + +bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + MachineBasicBlock::iterator LatestCFAlu = E; + while (I != E) { + MachineInstr *MI = I++; + if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || + TII->mustBeLastInClause(MI->getOpcode())) + LatestCFAlu = E; + if (!isCFAlu(MI)) + continue; + cleanPotentialDisabledCFAlu(MI); + + if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { + MI->eraseFromParent(); + } else { + assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); + LatestCFAlu = MI; + } + } + } + return false; +} + +const char *R600ClauseMergePass::getPassName() const { + return "R600 Merge Clause Markers Pass"; +} + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { + return new R600ClauseMergePass(TM); +} diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp new file mode 100644 index 00000000000..c8f37f61fc1 --- /dev/null +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -0,0 +1,679 @@ +//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass compute turns all control flow pseudo instructions into native one +/// computing their address on the fly ; it also sets STACK_SIZE info. 
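
The merge legality rules above boil down to three checks: the combined instruction count must stay below getMaxAlusPerClause(), the root clause must not be a CF_ALU_PUSH_BEFORE, and the KCache bank set-up of both clauses must be compatible. The toy model below shows only the counting part of that decision; the MachineInstr operands, the KCache checks, and the type names here are simplifications invented for the sketch.

    #include <cassert>
    #include <vector>

    struct ClauseMarker { unsigned Count; };   // stand-in for a CF_ALU marker

    // Collapse adjacent clause markers by summing their counts, as long as the
    // combined count stays under the per-clause limit.
    void mergeClauses(std::vector<ClauseMarker> &Clauses, unsigned MaxAlus) {
      std::vector<ClauseMarker> Out;
      for (const ClauseMarker &C : Clauses) {
        if (!Out.empty() && Out.back().Count + C.Count < MaxAlus)
          Out.back().Count += C.Count;         // merge into the previous clause
        else
          Out.push_back(C);                    // start a new clause
      }
      Clauses = Out;
    }

    int main() {
      std::vector<ClauseMarker> C = {{60}, {50}, {40}};
      mergeClauses(C, 120);                    // 60+50 merges; adding 40 would overflow
      assert(C.size() == 2 && C[0].Count == 110 && C[1].Count == 40);
      return 0;
    }
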
+//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "r600cf" + +namespace { + +struct CFStack { + + enum StackItem { + ENTRY = 0, + SUB_ENTRY = 1, + FIRST_NON_WQM_PUSH = 2, + FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 + }; + + const AMDGPUSubtarget *ST; + std::vector BranchStack; + std::vector LoopStack; + unsigned MaxStackSize; + unsigned CurrentEntries; + unsigned CurrentSubEntries; + + CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), + // We need to reserve a stack entry for CALL_FS in vertex shaders. + MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + CurrentEntries(0), CurrentSubEntries(0) { } + + unsigned getLoopDepth(); + bool branchStackContains(CFStack::StackItem); + bool requiresWorkAroundForInst(unsigned Opcode); + unsigned getSubEntrySize(CFStack::StackItem Item); + void updateMaxStackSize(); + void pushBranch(unsigned Opcode, bool isWQM = false); + void pushLoop(); + void popBranch(); + void popLoop(); +}; + +unsigned CFStack::getLoopDepth() { + return LoopStack.size(); +} + +bool CFStack::branchStackContains(CFStack::StackItem Item) { + for (std::vector::const_iterator I = BranchStack.begin(), + E = BranchStack.end(); I != E; ++I) { + if (*I == Item) + return true; + } + return false; +} + +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && + getLoopDepth() > 1) + return true; + + if (!ST->hasCFAluBug()) + return false; + + switch(Opcode) { + default: return false; + case AMDGPU::CF_ALU_PUSH_BEFORE: + case AMDGPU::CF_ALU_ELSE_AFTER: + case AMDGPU::CF_ALU_BREAK: + case AMDGPU::CF_ALU_CONTINUE: + if (CurrentSubEntries == 0) + return false; + if (ST->getWavefrontSize() == 64) { + // We are being conservative here. We only require this work-around if + // CurrentSubEntries > 3 && + // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) + // + // We have to be conservative, because we don't know for certain that + // our stack allocation algorithm for Evergreen/NI is correct. Applying this + // work-around when CurrentSubEntries > 3 allows us to over-allocate stack + // resources without any problems. + return CurrentSubEntries > 3; + } else { + assert(ST->getWavefrontSize() == 32); + // We are being conservative here. We only require the work-around if + // CurrentSubEntries > 7 && + // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) + // See the comment on the wavefront size == 64 case for why we are + // being conservative. + return CurrentSubEntries > 7; + } + } +} + +unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { + switch(Item) { + default: + return 0; + case CFStack::FIRST_NON_WQM_PUSH: + assert(!ST->hasCaymanISA()); + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { + // +1 For the push operation. + // +2 Extra space required. + return 3; + } else { + // Some documentation says that this is not necessary on Evergreen, + // but experimentation has show that we need to allocate 1 extra + // sub-entry for the first non-WQM push. + // +1 For the push operation. + // +1 Extra space required. 
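
CFStack tracks full entries and smaller sub-entries separately; updateMaxStackSize(), defined just below, combines them as CurrentEntries + RoundUpToAlignment(CurrentSubEntries, 4) / 4, so four sub-entries share one hardware stack entry. A quick worked check of that arithmetic, with arbitrary sample counts:

    #include <algorithm>
    #include <cassert>

    // Round up to the next multiple of 4, as RoundUpToAlignment(x, 4) does.
    static unsigned roundUpTo4(unsigned X) { return (X + 3) & ~3u; }

    int main() {
      unsigned CurrentEntries = 2, CurrentSubEntries = 5, MaxStackSize = 0;
      unsigned CurrentStackSize =
          CurrentEntries + roundUpTo4(CurrentSubEntries) / 4;
      MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
      assert(MaxStackSize == 4);   // 2 full entries + ceil(5 / 4) = 2 more
      return 0;
    }
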
+ return 2; + } + case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + // +1 For the push operation. + // +1 Extra space required. + return 2; + case CFStack::SUB_ENTRY: + return 1; + } +} + +void CFStack::updateMaxStackSize() { + unsigned CurrentStackSize = CurrentEntries + + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + MaxStackSize = std::max(CurrentStackSize, MaxStackSize); +} + +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (!isWQM) { + if (!ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries > 0 && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST->hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) + Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else + Item = CFStack::SUB_ENTRY; + } else + Item = CFStack::ENTRY; + break; + } + BranchStack.push_back(Item); + if (Item == CFStack::ENTRY) + CurrentEntries++; + else + CurrentSubEntries += getSubEntrySize(Item); + updateMaxStackSize(); +} + +void CFStack::pushLoop() { + LoopStack.push_back(CFStack::ENTRY); + CurrentEntries++; + updateMaxStackSize(); +} + +void CFStack::popBranch() { + CFStack::StackItem Top = BranchStack.back(); + if (Top == CFStack::ENTRY) + CurrentEntries--; + else + CurrentSubEntries-= getSubEntrySize(Top); + BranchStack.pop_back(); +} + +void CFStack::popLoop() { + CurrentEntries--; + LoopStack.pop_back(); +} + +class R600ControlFlowFinalizer : public MachineFunctionPass { + +private: + typedef std::pair > ClauseFile; + + enum ControlFlowInstruction { + CF_TC, + CF_VC, + CF_CALL_FS, + CF_WHILE_LOOP, + CF_END_LOOP, + CF_LOOP_BREAK, + CF_LOOP_CONTINUE, + CF_JUMP, + CF_ELSE, + CF_POP, + CF_END + }; + + static char ID; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + unsigned MaxFetchInst; + const AMDGPUSubtarget *ST; + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + return true; + default: + return false; + } + } + + const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { + unsigned Opcode = 0; + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + switch (CFI) { + case CF_TC: + Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + break; + case CF_VC: + Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + break; + case CF_CALL_FS: + Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + break; + case CF_WHILE_LOOP: + Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + break; + case CF_END_LOOP: + Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + break; + case CF_LOOP_BREAK: + Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + break; + case CF_LOOP_CONTINUE: + Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + break; + case CF_JUMP: + Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + break; + case CF_ELSE: + Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + break; + case CF_POP: + Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + break; + case CF_END: + if (ST->hasCaymanISA()) { + Opcode = AMDGPU::CF_END_CM; + break; + } + Opcode = isEg ? 
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + break; + } + assert (Opcode && "No opcode selected"); + return TII->get(Opcode); + } + + bool isCompatibleWithClause(const MachineInstr *MI, + std::set &DstRegs) const { + unsigned DstMI, SrcMI; + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg()) + continue; + if (MO.isDef()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + DstMI = Reg; + else + DstMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + if (MO.isUse()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + SrcMI = Reg; + else + SrcMI = TRI->getMatchingSuperReg(Reg, + TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + } + if ((DstRegs.find(SrcMI) == DstRegs.end())) { + DstRegs.insert(DstMI); + return true; + } else + return false; + } + + ClauseFile + MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector ClauseContent; + unsigned AluInstCount = 0; + bool IsTex = TII->usesTextureCache(ClauseHead); + std::set DstRegs; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (AluInstCount >= MaxFetchInst) + break; + if ((IsTex && !TII->usesTextureCache(I)) || + (!IsTex && !TII->usesVertexCache(I))) + break; + if (!isCompatibleWithClause(I, DstRegs)) + break; + AluInstCount ++; + ClauseContent.push_back(I); + } + MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), + getHWInstrDesc(IsTex?CF_TC:CF_VC)) + .addImm(0) // ADDR + .addImm(AluInstCount - 1); // COUNT + return ClauseFile(MIb, std::move(ClauseContent)); + } + + void getLiteral(MachineInstr *MI, std::vector &Lits) const { + static const unsigned LiteralRegs[] = { + AMDGPU::ALU_LITERAL_X, + AMDGPU::ALU_LITERAL_Y, + AMDGPU::ALU_LITERAL_Z, + AMDGPU::ALU_LITERAL_W + }; + const SmallVector, 3 > Srcs = + TII->getSrcs(MI); + for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { + if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) + continue; + int64_t Imm = Srcs[i].second; + std::vector::iterator It = + std::find(Lits.begin(), Lits.end(), Imm); + if (It != Lits.end()) { + unsigned Index = It - Lits.begin(); + Srcs[i].first->setReg(LiteralRegs[Index]); + } else { + assert(Lits.size() < 4 && "Too many literals in Instruction Group"); + Srcs[i].first->setReg(LiteralRegs[Lits.size()]); + Lits.push_back(Imm); + } + } + } + + MachineBasicBlock::iterator insertLiterals( + MachineBasicBlock::iterator InsertPos, + const std::vector &Literals) const { + MachineBasicBlock *MBB = InsertPos->getParent(); + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned LiteralPair0 = Literals[i]; + unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; + InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(LiteralPair0) + .addImm(LiteralPair1); + } + return InsertPos; + } + + ClauseFile + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector ClauseContent; + I++; + for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { + if (IsTrivialInst(I)) { + ++I; + continue; + } + if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) + break; + std::vector Literals; + if (I->isBundle()) 
{ + MachineInstr *DeleteMI = I; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + while (++BI != E && BI->isBundledWithPred()) { + BI->unbundleFromPred(); + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BI->getOperand(i); + if (MO.isReg() && MO.isInternalRead()) + MO.setIsInternalRead(false); + } + getLiteral(BI, Literals); + ClauseContent.push_back(BI); + } + I = BI; + DeleteMI->eraseFromParent(); + } else { + getLiteral(I, Literals); + ClauseContent.push_back(I); + I++; + } + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned literal0 = Literals[i]; + unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; + MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(literal0) + .addImm(literal2); + ClauseContent.push_back(MILit); + } + } + assert(ClauseContent.size() < 128 && "ALU clause is too big"); + ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(ClauseHead, std::move(ClauseContent)); + } + + void + EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += 2 * Clause.second.size(); + } + + void + EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + Clause.first->getOperand(0).setImm(0); + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += Clause.second.size(); + } + + void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { + MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + } + void CounterPropagateAddr(const std::set &MIs, + unsigned Addr) const { + for (MachineInstr *MI : MIs) { + CounterPropagateAddr(MI, Addr); + } + } + +public: + R600ControlFlowFinalizer(TargetMachine &tm) + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + ST = &MF.getSubtarget(); + MaxFetchInst = ST->getTexVTXClauseSize(); + TII = static_cast(ST->getInstrInfo()); + TRI = static_cast(ST->getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo(); + + CFStack CFStack(ST, MFI->getShaderType()); + for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; + ++MB) { + MachineBasicBlock &MBB = *MB; + unsigned CfCount = 0; + std::vector > > LoopStack; + std::vector IfThenElseStack; + if (MFI->getShaderType() == ShaderType::VERTEX) { + BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), + getHWInstrDesc(CF_CALL_FS)); + CfCount++; + } + std::vector FetchClauses, AluClauses; + std::vector LastAlu(1); + std::vector ToPopAfter; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E;) { + if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + DEBUG(dbgs() << CfCount << ":"; I->dump();); + FetchClauses.push_back(MakeFetchClause(MBB, I)); + CfCount++; + LastAlu.back() = nullptr; + continue; + } + + MachineBasicBlock::iterator MI = I; + if (MI->getOpcode() != AMDGPU::ENDIF) + 
LastAlu.back() = nullptr; + if (MI->getOpcode() == AMDGPU::CF_ALU) + LastAlu.back() = MI; + I++; + bool RequiresWorkAround = + CFStack.requiresWorkAroundForInst(MI->getOpcode()); + switch (MI->getOpcode()) { + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (RequiresWorkAround) { + DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) + .addImm(CfCount + 1) + .addImm(1); + MI->setDesc(TII->get(AMDGPU::CF_ALU)); + CfCount++; + CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else + CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + + case AMDGPU::CF_ALU: + I = MI; + AluClauses.push_back(MakeALUClause(MBB, I)); + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + break; + case AMDGPU::WHILELOOP: { + CFStack.pushLoop(); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_WHILE_LOOP)) + .addImm(1); + std::pair > Pair(CfCount, + std::set()); + Pair.second.insert(MIb); + LoopStack.push_back(std::move(Pair)); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDLOOP: { + CFStack.popLoop(); + std::pair > Pair = + std::move(LoopStack.back()); + LoopStack.pop_back(); + CounterPropagateAddr(Pair.second, CfCount); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) + .addImm(Pair.first + 1); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::IF_PREDICATE_SET: { + LastAlu.push_back(nullptr); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_JUMP)) + .addImm(0) + .addImm(0); + IfThenElseStack.push_back(MIb); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ELSE: { + MachineInstr * JumpInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(JumpInst, CfCount); + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_ELSE)) + .addImm(0) + .addImm(0); + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + IfThenElseStack.push_back(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::ENDIF: { + CFStack.popBranch(); + if (LastAlu.back()) { + ToPopAfter.push_back(LastAlu.back()); + } else { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_POP)) + .addImm(CfCount + 1) + .addImm(1); + (void)MIb; + DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + CfCount++; + } + + MachineInstr *IfOrElseInst = IfThenElseStack.back(); + IfThenElseStack.pop_back(); + CounterPropagateAddr(IfOrElseInst, CfCount); + IfOrElseInst->getOperand(1).setImm(1); + LastAlu.pop_back(); + MI->eraseFromParent(); + break; + } + case AMDGPU::BREAK: { + CfCount ++; + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_BREAK)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + break; + } + case AMDGPU::CONTINUE: { + MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), + getHWInstrDesc(CF_LOOP_CONTINUE)) + .addImm(0); + LoopStack.back().second.insert(MIb); + MI->eraseFromParent(); + CfCount++; + break; + } + case AMDGPU::RETURN: { + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + CfCount++; + MI->eraseFromParent(); + if (CfCount % 2) { + BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + CfCount++; + } + for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) + EmitFetchClause(I, FetchClauses[i], CfCount); + for (unsigned i = 0, e = AluClauses.size(); i < e; i++) + EmitALUClause(I, AluClauses[i], CfCount); 
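
MakeALUClause() and insertLiterals() above both flush the collected literal pool two values at a time, padding an odd count with a zero, since each LITERALS pseudo carries a pair of immediates. A small standalone sketch of that pairing; the function name and the sample literal values are invented for illustration.

    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Pack a literal pool into (first, second) pairs, zero-padding the tail,
    // mirroring the Literals loops in the pass above.
    static std::vector<std::pair<uint32_t, uint32_t>>
    packLiterals(const std::vector<uint32_t> &Lits) {
      std::vector<std::pair<uint32_t, uint32_t>> Out;
      for (unsigned i = 0, e = Lits.size(); i < e; i += 2)
        Out.push_back({Lits[i], (i + 1 < e) ? Lits[i + 1] : 0});
      return Out;
    }

    int main() {
      auto P = packLiterals({0x3f800000, 0x40000000, 0x7f});
      assert(P.size() == 2);
      assert(P[1].first == 0x7f && P[1].second == 0);  // odd literal padded with 0
      return 0;
    }
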
+ } + default: + if (TII->isExport(MI->getOpcode())) { + DEBUG(dbgs() << CfCount << ":"; MI->dump();); + CfCount++; + } + break; + } + } + for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { + MachineInstr *Alu = ToPopAfter[i]; + BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), + TII->get(AMDGPU::CF_ALU_POP_AFTER)) + .addImm(Alu->getOperand(0).getImm()) + .addImm(Alu->getOperand(1).getImm()) + .addImm(Alu->getOperand(2).getImm()) + .addImm(Alu->getOperand(3).getImm()) + .addImm(Alu->getOperand(4).getImm()) + .addImm(Alu->getOperand(5).getImm()) + .addImm(Alu->getOperand(6).getImm()) + .addImm(Alu->getOperand(7).getImm()) + .addImm(Alu->getOperand(8).getImm()); + Alu->eraseFromParent(); + } + MFI->StackSize = CFStack.MaxStackSize; + } + + return false; + } + + const char *getPassName() const override { + return "R600 Control Flow Finalizer Pass"; + } +}; + +char R600ControlFlowFinalizer::ID = 0; + +} // end anonymous namespace + + +llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { + return new R600ControlFlowFinalizer(TM); +} diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h new file mode 100644 index 00000000000..51d87eda31d --- /dev/null +++ b/lib/Target/AMDGPU/R600Defines.h @@ -0,0 +1,171 @@ +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H +#define LLVM_LIB_TARGET_R600_R600DEFINES_H + +#include "llvm/MC/MCRegisterInfo.h" + +// Operand Flags +#define MO_FLAG_CLAMP (1 << 0) +#define MO_FLAG_NEG (1 << 1) +#define MO_FLAG_ABS (1 << 2) +#define MO_FLAG_MASK (1 << 3) +#define MO_FLAG_PUSH (1 << 4) +#define MO_FLAG_NOT_LAST (1 << 5) +#define MO_FLAG_LAST (1 << 6) +#define NUM_MO_FLAGS 7 + +/// \brief Helper for getting the operand index for the instruction flags +/// operand. 
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) + +namespace R600_InstFlag { + enum TIF { + TRANS_ONLY = (1 << 0), + TEX = (1 << 1), + REDUCTION = (1 << 2), + FC = (1 << 3), + TRIG = (1 << 4), + OP3 = (1 << 5), + VECTOR = (1 << 6), + //FlagOperand bits 7, 8 + NATIVE_OPERANDS = (1 << 9), + OP1 = (1 << 10), + OP2 = (1 << 11), + VTX_INST = (1 << 12), + TEX_INST = (1 << 13), + ALU_INST = (1 << 14), + LDS_1A = (1 << 15), + LDS_1A1D = (1 << 16), + IS_EXPORT = (1 << 17), + LDS_1A2D = (1 << 18) + }; +} + +#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) + +/// \brief Defines for extracting register information from register encoding +#define HW_REG_MASK 0x1ff +#define HW_CHAN_SHIFT 9 + +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + +#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) +#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) + +namespace OpName { + + enum VecOps { + UPDATE_EXEC_MASK_X, + UPDATE_PREDICATE_X, + WRITE_X, + OMOD_X, + DST_REL_X, + CLAMP_X, + SRC0_X, + SRC0_NEG_X, + SRC0_REL_X, + SRC0_ABS_X, + SRC0_SEL_X, + SRC1_X, + SRC1_NEG_X, + SRC1_REL_X, + SRC1_ABS_X, + SRC1_SEL_X, + PRED_SEL_X, + UPDATE_EXEC_MASK_Y, + UPDATE_PREDICATE_Y, + WRITE_Y, + OMOD_Y, + DST_REL_Y, + CLAMP_Y, + SRC0_Y, + SRC0_NEG_Y, + SRC0_REL_Y, + SRC0_ABS_Y, + SRC0_SEL_Y, + SRC1_Y, + SRC1_NEG_Y, + SRC1_REL_Y, + SRC1_ABS_Y, + SRC1_SEL_Y, + PRED_SEL_Y, + UPDATE_EXEC_MASK_Z, + UPDATE_PREDICATE_Z, + WRITE_Z, + OMOD_Z, + DST_REL_Z, + CLAMP_Z, + SRC0_Z, + SRC0_NEG_Z, + SRC0_REL_Z, + SRC0_ABS_Z, + SRC0_SEL_Z, + SRC1_Z, + SRC1_NEG_Z, + SRC1_REL_Z, + SRC1_ABS_Z, + SRC1_SEL_Z, + PRED_SEL_Z, + UPDATE_EXEC_MASK_W, + UPDATE_PREDICATE_W, + WRITE_W, + OMOD_W, + DST_REL_W, + CLAMP_W, + SRC0_W, + SRC0_NEG_W, + SRC0_REL_W, + SRC0_ABS_W, + SRC0_SEL_W, + SRC1_W, + SRC1_NEG_W, + SRC1_REL_W, + SRC1_ABS_W, + SRC1_SEL_W, + PRED_SEL_W, + IMM_0, + IMM_1, + VEC_COUNT + }; + +} + +//===----------------------------------------------------------------------===// +// Config register definitions +//===----------------------------------------------------------------------===// + +#define R_02880C_DB_SHADER_CONTROL 0x02880C +#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) + +// These fields are the same for all shader types and families. 
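+// The value written to the SQ_PGM_RESOURCES registers below is typically
+// assembled by OR-ing these fields together, e.g.
+//   S_NUM_GPRS(NumGPRs) | S_STACK_SIZE(StackSize)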
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0) +#define S_STACK_SIZE(x) (((x) & 0xFF) << 8) +//===----------------------------------------------------------------------===// +// R600, R700 Registers +//===----------------------------------------------------------------------===// + +#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 +#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 + +//===----------------------------------------------------------------------===// +// Evergreen, Northern Islands Registers +//===----------------------------------------------------------------------===// + +#define R_028844_SQ_PGM_RESOURCES_PS 0x028844 +#define R_028860_SQ_PGM_RESOURCES_VS 0x028860 +#define R_028878_SQ_PGM_RESOURCES_GS 0x028878 +#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4 + +#define R_0288E8_SQ_LDS_ALLOC 0x0288E8 + +#endif diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp new file mode 100644 index 00000000000..fdc20302f4a --- /dev/null +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -0,0 +1,336 @@ +//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold +/// 128 Alu instructions ; these instructions can access up to 4 prefetched +/// 4 lines of 16 registers from constant buffers. Such ALU clauses are +/// initiated by CF_ALU instructions. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace llvm { + void initializeR600EmitClauseMarkersPass(PassRegistry&); +} + +namespace { + +class R600EmitClauseMarkers : public MachineFunctionPass { + +private: + const R600InstrInfo *TII; + int Address; + + unsigned OccupiedDwords(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return 4; + case AMDGPU::KILL: + return 0; + default: + break; + } + + // These will be expanded to two ALU instructions in the + // ExpandSpecialInstructions pass. 
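+    // (the LDS op itself plus the MOV that copies its result out of OQAP,
+    // hence the return value of 2 below).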
+ if (TII->isLDSRetInstr(MI->getOpcode())) + return 2; + + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode())) + return 4; + + unsigned NumLiteral = 0; + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++NumLiteral; + } + return 1 + NumLiteral; + } + + bool isALU(const MachineInstr *MI) const { + if (TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } + } + + bool IsTrivialInst(MachineInstr *MI) const { + switch (MI->getOpcode()) { + case AMDGPU::KILL: + case AMDGPU::RETURN: + case AMDGPU::IMPLICIT_DEF: + return true; + default: + return false; + } + } + + std::pair getAccessedBankLine(unsigned Sel) const { + // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 + // (See also R600ISelLowering.cpp) + // ConstIndex value is in [0, 4095]; + return std::pair( + ((Sel >> 2) - 512) >> 12, // KC_BANK + // Line Number of ConstIndex + // A line contains 16 constant registers however KCX bank can lock + // two line at the same time ; thus we want to get an even line number. + // Line number can be retrieved with (>>4), using (>>5) <<1 generates + // an even number. + ((((Sel >> 2) - 512) & 4095) >> 5) << 1); + } + + bool SubstituteKCacheBank(MachineInstr *MI, + std::vector > &CachedConsts, + bool UpdateInstr = true) const { + std::vector > UsedKCache; + + if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + return true; + + const SmallVectorImpl > &Consts = + TII->getSrcs(MI); + assert((TII->isALUInstr(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + unsigned Sel = Consts[i].second; + unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; + unsigned KCacheIndex = Index * 4 + Chan; + const std::pair &BankLine = getAccessedBankLine(Sel); + if (CachedConsts.empty()) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair(0, KCacheIndex)); + continue; + } + if (CachedConsts[0] == BankLine) { + UsedKCache.push_back(std::pair(0, KCacheIndex)); + continue; + } + if (CachedConsts.size() == 1) { + CachedConsts.push_back(BankLine); + UsedKCache.push_back(std::pair(1, KCacheIndex)); + continue; + } + if (CachedConsts[1] == BankLine) { + UsedKCache.push_back(std::pair(1, KCacheIndex)); + continue; + } + return false; + } + + if (!UpdateInstr) + return true; + + for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { + if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + continue; + switch(UsedKCache[j].first) { + case 0: + Consts[i].first->setReg( + AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + break; + case 1: + Consts[i].first->setReg( + AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + break; + default: + llvm_unreachable("Wrong Cache Line"); + } + j++; + } + return true; + } + + bool canClauseLocalKillFitInClause( + unsigned AluInstCount, + std::vector > KCacheBanks, + MachineBasicBlock::iterator Def, + MachineBasicBlock::iterator BBEnd) { + const R600RegisterInfo &TRI = 
TII->getRegisterInfo(); + for (MachineInstr::const_mop_iterator + MOI = Def->operands_begin(), + MOE = Def->operands_end(); MOI != MOE; ++MOI) { + if (!MOI->isReg() || !MOI->isDef() || + TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) + continue; + + // Def defines a clause local register, so check that its use will fit + // in the clause. + unsigned LastUseCount = 0; + for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { + AluInstCount += OccupiedDwords(UseI); + // Make sure we won't need to end the clause due to KCache limitations. + if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + return false; + + // We have reached the maximum instruction limit before finding the + // use that kills this register, so we cannot use this def in the + // current clause. + if (AluInstCount >= TII->getMaxAlusPerClause()) + return false; + + // Register kill flags have been cleared by the time we get to this + // pass, but it is safe to assume that all uses of this register + // occur in the same basic block as its definition, because + // it is illegal for the scheduler to schedule them in + // different blocks. + if (UseI->findRegisterUseOperandIdx(MOI->getReg())) + LastUseCount = AluInstCount; + + if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) + break; + } + if (LastUseCount) + return LastUseCount <= TII->getMaxAlusPerClause(); + llvm_unreachable("Clause local register live at end of clause."); + } + return true; + } + + MachineBasicBlock::iterator + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator ClauseHead = I; + std::vector > KCacheBanks; + bool PushBeforeModifier = false; + unsigned AluInstCount = 0; + for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { + if (IsTrivialInst(I)) + continue; + if (!isALU(I)) + break; + if (AluInstCount > TII->getMaxAlusPerClause()) + break; + if (I->getOpcode() == AMDGPU::PRED_X) { + // We put PRED_X in its own clause to ensure that ifcvt won't create + // clauses with more than 128 insts. + // IfCvt is indeed checking that "then" and "else" branches of an if + // statement have less than ~60 insts thus converted clauses can't be + // bigger than ~121 insts (predicate setter needs to be in the same + // clause as predicated alus). + if (AluInstCount > 0) + break; + if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + PushBeforeModifier = true; + AluInstCount ++; + continue; + } + // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: + // + // * KILL or INTERP instructions + // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits + // * Uses waterfalling (i.e. INDEX_MODE = AR.X) + // + // XXX: These checks have not been implemented yet. + if (TII->mustBeLastInClause(I->getOpcode())) { + I++; + break; + } + + // If this instruction defines a clause local register, make sure + // its use can fit in this clause. + if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) + break; + + if (!SubstituteKCacheBank(I, KCacheBanks)) + break; + AluInstCount += OccupiedDwords(I); + } + unsigned Opcode = PushBeforeModifier ? + AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) + // We don't use the ADDR field until R600ControlFlowFinalizer pass, where + // it is safe to assume it is 0. However if we always put 0 here, the ifcvt + // pass may assume that identical ALU clause starter at the beginning of a + // true and false branch can be factorized which is not the case. 
+ .addImm(Address++) // ADDR + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 + .addImm(KCacheBanks.empty()?0:2) // KM0 + .addImm((KCacheBanks.size() < 2)?0:2) // KM1 + .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 + .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 + .addImm(AluInstCount) // COUNT + .addImm(1); // Enabled + return I; + } + +public: + static char ID; + R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { + + initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + if (I->getOpcode() == AMDGPU::CF_ALU) + continue; // BB was already parsed + for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { + if (isALU(I)) + I = MakeALUClause(MBB, I); + else + ++I; + } + } + return false; + } + + const char *getPassName() const override { + return "R600 Emit Clause Markers Pass"; + } +}; + +char R600EmitClauseMarkers::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", + "R600 Emit Clause Markters", false, false) +INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", + "R600 Emit Clause Markters", false, false) + +llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { + return new R600EmitClauseMarkers(); +} + diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp new file mode 100644 index 00000000000..211d392e8fc --- /dev/null +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -0,0 +1,349 @@ +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Vector, Reduction, and Cube instructions need to fill the entire instruction +/// group to work correctly. This pass expands these individual instructions +/// into several instructions that will completely fill the instruction group. 
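+/// For example a reduction op such as DP4, written on a single channel,
+/// is rewritten into four per-channel slots with the unused destinations
+/// write-masked; see the expansion sketches in runOnMachineFunction below.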
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class R600ExpandSpecialInstrsPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, + unsigned Op); + +public: + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "R600 Expand special instructions pass"; + } +}; + +} // End anonymous namespace + +char R600ExpandSpecialInstrsPass::ID = 0; + +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { + return new R600ExpandSpecialInstrsPass(TM); +} + +void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, + const MachineInstr *OldMI, unsigned Op) { + int OpIdx = TII->getOperandIdx(*OldMI, Op); + if (OpIdx > -1) { + uint64_t Val = OldMI->getOperand(OpIdx).getImm(); + TII->setImmOperand(NewMI, Op, Val); + } +} + +bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + MachineBasicBlock::iterator I = MBB.begin(); + while (I != MBB.end()) { + MachineInstr &MI = *I; + I = std::next(I); + + // Expand LDS_*_RET instructions + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineOperand &DstOp = MI.getOperand(DstIdx); + MachineInstr *Mov = TII->buildMovInstr(&MBB, I, + DstOp.getReg(), AMDGPU::OQAP); + DstOp.setReg(AMDGPU::OQAP); + int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), + AMDGPU::OpName::pred_sel); + int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), + AMDGPU::OpName::pred_sel); + // Copy the pred_sel bit + Mov->getOperand(MovPredSelIdx).setReg( + MI.getOperand(LDSPredSelIdx).getReg()); + } + + switch (MI.getOpcode()) { + default: break; + // Expand PRED_X to one of the PRED_SET instructions. + case AMDGPU::PRED_X: { + uint64_t Flags = MI.getOperand(3).getImm(); + // The native opcode used by PRED_X is stored as an immediate in the + // third operand. + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + MI.getOperand(2).getImm(), // opcode + MI.getOperand(0).getReg(), // dst + MI.getOperand(1).getReg(), // src0 + AMDGPU::ZERO); // src1 + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + if (Flags & MO_FLAG_PUSH) { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); + } else { + TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); + } + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_XY: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = MI.getOperand(Chan).getReg(); + else + DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan >= 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_ZW: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; + else + DstReg = MI.getOperand(Chan-2).getReg(); + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan < 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_VEC_LOAD: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(1).getImm()); + unsigned DstReg = MI.getOperand(0).getReg(); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, + TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + case AMDGPU::DOT_4: { + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Mask) { + TII->addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + unsigned Opcode = BMI->getOpcode(); + // While not strictly necessary from hw point of view, we force + // all src operands of a dot4 inst to belong to the same slot. 
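+          // The assert below only applies when both source encodings are
+          // below 127, i.e. ordinary register operands; literals and other
+          // special operands are presumably exempt from the same-slot rule.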
+ unsigned Src0 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + .getReg(); + unsigned Src1 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + .getReg(); + (void) Src0; + (void) Src1; + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && + (TRI.getEncodingValue(Src1) & 0xff) < 127) + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); + } + MI.eraseFromParent(); + continue; + } + } + + bool IsReduction = TII->isReductionOp(MI.getOpcode()); + bool IsVector = TII->isVector(MI); + bool IsCube = TII->isCubeOp(MI.getOpcode()); + if (!IsReduction && !IsVector && !IsCube) { + continue; + } + + // Expand the instruction + // + // Reduction instructions: + // T0_X = DP4 T1_XYZW, T2_XYZW + // becomes: + // TO_X = DP4 T1_X, T2_X + // TO_Y (write masked) = DP4 T1_Y, T2_Y + // TO_Z (write masked) = DP4 T1_Z, T2_Z + // TO_W (write masked) = DP4 T1_W, T2_W + // + // Vector instructions: + // T0_X = MULLO_INT T1_X, T2_X + // becomes: + // T0_X = MULLO_INT T1_X, T2_X + // T0_Y (write masked) = MULLO_INT T1_X, T2_X + // T0_Z (write masked) = MULLO_INT T1_X, T2_X + // T0_W (write masked) = MULLO_INT T1_X, T2_X + // + // Cube instructions: + // T0_XYZW = CUBE T1_XYZW + // becomes: + // TO_X = CUBE T1_Z, T1_Y + // T0_Y = CUBE T1_Z, T1_X + // T0_Z = CUBE T1_X, T1_Z + // T0_W = CUBE T1_Y, T1_Z + for (unsigned Chan = 0; Chan < 4; Chan++) { + unsigned DstReg = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + unsigned Src0 = MI.getOperand( + TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + unsigned Src1 = 0; + + // Determine the correct source registers + if (!IsCube) { + int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + if (Src1Idx != -1) { + Src1 = MI.getOperand(Src1Idx).getReg(); + } + } + if (IsReduction) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + Src0 = TRI.getSubReg(Src0, SubRegIndex); + Src1 = TRI.getSubReg(Src1, SubRegIndex); + } else if (IsCube) { + static const int CubeSrcSwz[] = {2, 2, 0, 1}; + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + Src1 = TRI.getSubReg(Src0, SubRegIndex1); + Src0 = TRI.getSubReg(Src0, SubRegIndex0); + } + + // Determine the correct destination registers; + bool Mask = false; + bool NotLast = true; + if (IsCube) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + DstReg = TRI.getSubReg(DstReg, SubRegIndex); + } else { + // Mask the write if the original instruction does not write to + // the current Channel. 
+ Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + } + + // Set the IsLast bit + NotLast = (Chan != 3 ); + + // Add the new instruction + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::CUBE_r600_pseudo: + Opcode = AMDGPU::CUBE_r600_real; + break; + case AMDGPU::CUBE_eg_pseudo: + Opcode = AMDGPU::CUBE_eg_real; + break; + default: + break; + } + + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); + + if (Chan != 0) + NewMI->bundleWithPred(); + if (Mask) { + TII->addFlag(NewMI, 0, MO_FLAG_MASK); + } + if (NotLast) { + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + } + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + } + MI.eraseFromParent(); + } + } + return false; +} diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp new file mode 100644 index 00000000000..8357b6d9d0e --- /dev/null +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -0,0 +1,2286 @@ +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for R600 +// +//===----------------------------------------------------------------------===// + +#include "R600ISelLowering.h" +#include "AMDGPUFrameLowering.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Function.h" + +using namespace llvm; + +R600TargetLowering::R600TargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + // Set condition code actions + setCondCodeAction(ISD::SETO, MVT::f32, Expand); + setCondCodeAction(ISD::SETUO, MVT::f32, Expand); + setCondCodeAction(ISD::SETLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + 
setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULE, MVT::f32, Expand); + + setCondCodeAction(ISD::SETLE, MVT::i32, Expand); + setCondCodeAction(ISD::SETLT, MVT::i32, Expand); + setCondCodeAction(ISD::SETULE, MVT::i32, Expand); + setCondCodeAction(ISD::SETULT, MVT::i32, Expand); + + setOperationAction(ISD::FCOS, MVT::f32, Custom); + setOperationAction(ISD::FSIN, MVT::f32, Custom); + + setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v2i32, Expand); + + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + setOperationAction(ISD::FSUB, MVT::f32, Expand); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Expand); + setOperationAction(ISD::SETCC, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + + // ADD, SUB overflow. + // TODO: turn these into Legal? + if (Subtarget->hasCARRY()) + setOperationAction(ISD::UADDO, MVT::i32, Custom); + + if (Subtarget->hasBORROW()) + setOperationAction(ISD::USUBO, MVT::i32, Custom); + + // Expand sign extension of vectors + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); + + if (!Subtarget->hasBFE()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + + + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. 
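+  // For every integer result type: extending loads from i1 are promoted,
+  // while i8 and i16 sources are custom lowered (see the comment above).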
+ for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + + // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 + // to be Legal/Custom in order to avoid library calls. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::SUBE, VT, Expand); + } + + setSchedulingPreference(Sched::Source); +} + +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineFunction * MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock::iterator I = *MI; + const R600InstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + switch (MI->getOpcode()) { + default: + // Replace LDS_*_RET instruction that don't have any uses with the + // equivalent LDS_*_NORET instruction. + if (TII->isLDSRetInstr(MI->getOpcode())) { + int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineInstrBuilder NewMI; + // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add + // LDS_1A2D support and remove this special case. 
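+      // Keep the _RET form when its result is actually used (or for
+      // LDS_CMPST_RET, which has no no-return mapping yet); otherwise the
+      // instruction is rebuilt below with the equivalent NORET opcode.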
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || + MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + return BB; + + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI->getOperand(i)); + } + } else { + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + } + break; + case AMDGPU::CLAMP_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + break; + } + + case AMDGPU::FABS_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_ABS); + break; + } + + case AMDGPU::FNEG_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_NEG); + break; + } + + case AMDGPU::MASK_WRITE: { + unsigned maskedRegister = MI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); + TII->addFlag(defInstr, 0, MO_FLAG_MASK); + break; + } + + case AMDGPU::MOV_IMM_F32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getFPImm()->getValueAPF() + .bitcastToAPInt().getZExtValue()); + break; + case AMDGPU::MOV_IMM_I32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); + break; + case AMDGPU::CONST_COPY: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, + MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, + MI->getOperand(1).getImm()); + break; + } + + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_64_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(EOP); // Set End of program bit + break; + } + + case AMDGPU::TXD: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::TXD_SHADOW: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + MachineOperand &RID = MI->getOperand(4); + MachineOperand &SID = MI->getOperand(5); + unsigned TextureId = MI->getOperand(6).getImm(); + unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; + unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; + + switch (TextureId) { + case 5: // Rect + CTX = CTY = 0; + break; + case 6: // Shadow1D + SrcW = SrcZ; + break; + case 7: // Shadow2D + SrcW = SrcZ; + break; + case 8: // ShadowRect + CTX = CTY = 0; + SrcW = SrcZ; + break; + case 9: // 1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 10: // 2DArray + CTZ = 0; + break; + case 11: // Shadow1DArray + SrcZ = SrcY; + CTZ = 0; + break; + case 12: // Shadow2DArray + CTZ = 0; + break; + } + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 
+ .addOperand(MI->getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)); + break; + + case AMDGPU::BRANCH_COND_f32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::BRANCH_COND_i32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if its not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = std::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = std::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } + bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + if (!EOP && !isLastInstructionOfItsType) + return BB; + unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addImm(CfInst) + .addImm(EOP); + break; + } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. 
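+    // MFI->LiveOuts is filled in, for instance, by the AMDGPU_store_output
+    // intrinsic lowering in LowerOperation further down.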
+ R600MachineFunctionInfo *MFI = MF->getInfo(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } + } + + MI->eraseFromParent(); + return BB; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo(); + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); + case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); + case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); + case ISD::FCOS: + case ISD::FSIN: return LowerTrig(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::INTRINSIC_VOID: { + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = + cast(Op.getOperand(1))->getZExtValue(); + switch (IntrinsicID) { + case AMDGPUIntrinsic::AMDGPU_store_output: { + int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MFI->LiveOuts.push_back(Reg); + return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); + } + case AMDGPUIntrinsic::R600_store_swizzle: { + SDLoc DL(Op); + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, DL, MVT::i32), // SWZ_X + DAG.getConstant(1, DL, MVT::i32), // SWZ_Y + DAG.getConstant(2, DL, MVT::i32), // SWZ_Z + DAG.getConstant(3, DL, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); + } + + // default for switch(IntrinsicID) + default: break; + } + // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + switch(IntrinsicID) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case AMDGPUIntrinsic::R600_load_input: { + int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(Reg); + return DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), Reg, VT); + } + + case AMDGPUIntrinsic::R600_interp_input: { + int slot = cast(Op.getOperand(1))->getZExtValue(); + int ijb = cast(Op.getOperand(2))->getSExtValue(); + MachineSDNode *interp; + if (ijb < 0) { + const R600InstrInfo *TII = + 
static_cast(Subtarget->getInstrInfo()); + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); + unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); + MRI.addLiveIn(RegisterI); + MRI.addLiveIn(RegisterJ); + SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); + SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), + SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), + RegisterJNode, RegisterINode); + return SDValue(interp, slot % 2); + } + case AMDGPUIntrinsic::R600_interp_xy: + case AMDGPUIntrinsic::R600_interp_zw: { + int slot = cast(Op.getOperand(1))->getZExtValue(); + MachineSDNode *interp; + SDValue RegisterINode = Op.getOperand(2); + SDValue RegisterJNode = Op.getOperand(3); + + if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), + RegisterJNode, RegisterINode); + return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, + SDValue(interp, 0), SDValue(interp, 1)); + } + case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::R600_ldptr: { + unsigned TextureOp; + switch (IntrinsicID) { + case AMDGPUIntrinsic::R600_tex: + TextureOp = 0; + break; + case AMDGPUIntrinsic::R600_texc: + TextureOp = 1; + break; + case AMDGPUIntrinsic::R600_txl: + TextureOp = 2; + break; + case AMDGPUIntrinsic::R600_txlc: + TextureOp = 3; + break; + case AMDGPUIntrinsic::R600_txb: + TextureOp = 4; + break; + case AMDGPUIntrinsic::R600_txbc: + TextureOp = 5; + break; + case AMDGPUIntrinsic::R600_txf: + TextureOp = 6; + break; + case AMDGPUIntrinsic::R600_txq: + TextureOp = 7; + break; + case AMDGPUIntrinsic::R600_ddx: + TextureOp = 8; + break; + case AMDGPUIntrinsic::R600_ddy: + TextureOp = 9; + break; + case AMDGPUIntrinsic::R600_ldptr: + TextureOp = 10; + break; + default: + llvm_unreachable("Unknow Texture Operation"); + } + + SDValue TexArgs[19] = { + DAG.getConstant(TextureOp, DL, MVT::i32), + Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), + DAG.getConstant(3, DL, MVT::i32), + 
Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10) + }; + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); + } + case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(3, DL, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(3, DL, MVT::i32)) + }; + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); + } + + case Intrinsic::r600_read_ngroups_x: + return LowerImplicitParameter(DAG, VT, DL, 0); + case Intrinsic::r600_read_ngroups_y: + return LowerImplicitParameter(DAG, VT, DL, 1); + case Intrinsic::r600_read_ngroups_z: + return LowerImplicitParameter(DAG, VT, DL, 2); + case Intrinsic::r600_read_global_size_x: + return LowerImplicitParameter(DAG, VT, DL, 3); + case Intrinsic::r600_read_global_size_y: + return LowerImplicitParameter(DAG, VT, DL, 4); + case Intrinsic::r600_read_global_size_z: + return LowerImplicitParameter(DAG, VT, DL, 5); + case Intrinsic::r600_read_local_size_x: + return LowerImplicitParameter(DAG, VT, DL, 6); + case Intrinsic::r600_read_local_size_y: + return LowerImplicitParameter(DAG, VT, DL, 7); + case Intrinsic::r600_read_local_size_z: + return LowerImplicitParameter(DAG, VT, DL, 8); + + case Intrinsic::AMDGPU_read_workdim: + return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); + case Intrinsic::AMDGPU_rsq: + // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) + break; + } + } // end switch(Op.getOpcode()) + return SDValue(); +} + +void R600TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); + return; + case ISD::FP_TO_UINT: + if (N->getValueType(0) == MVT::i1) { + Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + } + // Fall-through. Since we don't care about out of bounds values + // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint + // considers some extra cases which are not necessary here. + case ISD::FP_TO_SINT: { + SDValue Result; + if (expandFP_TO_SINT(N, Result, DAG)) + Results.push_back(Result); + return; + } + case ISD::SDIVREM: { + SDValue Op = SDValue(N, 1); + SDValue RES = LowerSDIVREM(Op, DAG); + Results.push_back(RES); + Results.push_back(RES.getValue(1)); + break; + } + case ISD::UDIVREM: { + SDValue Op = SDValue(N, 0); + LowerUDIVREM64(Op, DAG, Results); + break; + } + } +} + +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, + DAG.getConstant(i, DL, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); +} + +SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + // On hw >= R700, COS/SIN input must be between -1. and 1. 
+ // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDLoc DL(Op); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.15915494309, DL, MVT::f32)), + DAG.getConstantFP(0.5, DL, MVT::f32))); + unsigned TrigNode; + switch (Op.getOpcode()) { + case ISD::FCOS: + TrigNode = AMDGPUISD::COS_HW; + break; + case ISD::FSIN: + TrigNode = AMDGPUISD::SIN_HW; + break; + default: + llvm_unreachable("Wrong trig opcode"); + } + SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, + DAG.getNode(ISD::FADD, DL, VT, FractPart, + DAG.getConstantFP(-0.5, DL, MVT::f32))); + if (Gen >= AMDGPUSubtarget::R700) + return TrigVal; + // On R600 hw, COS/SIN input must be between -Pi and Pi. + return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, + DAG.getConstantFP(3.14159265359, DL, MVT::f32)); +} + +SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); + Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); + HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); + SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); + + SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); + SDValue LoBig = Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Shift = Op.getOperand(2); + SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue One = DAG.getConstant(1, DL, VT); + + const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; + + SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); + SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); + SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); + SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); + + // The dance around Width1 is necessary for 0 special case. + // Without it the CompShift might be 32, producing incorrect results in + // Overflow. So we do the shift in two steps, the alternative is to + // add a conditional to filter the special case. + + SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); + Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); + + SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); + SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); + LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); + + SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); + SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; + + Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); + Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); +} + +SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + + SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); + // Extend sign. + OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, + DAG.getValueType(MVT::i1)); + + SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); + + return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); +} + +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + return DAG.getNode( + ISD::SETCC, + DL, + MVT::i1, + Op, DAG.getConstantFP(0.0f, DL, MVT::f32), + DAG.getCondCode(ISD::SETNE) + ); +} + +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, + unsigned DwordOffset) const { + unsigned ByteOffset = DwordOffset * 4; + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // We shouldn't be using an offset wider than 16-bits for implicit parameters. + assert(isInt<16>(ByteOffset)); + + return DAG.getLoad(VT, DL, DAG.getEntryNode(), + DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR + MachinePointerInfo(ConstantPointerNull::get(PtrType)), + false, false, false, 0); +} + +bool R600TargetLowering::isZero(SDValue Op) const { + if(ConstantSDNode *Cst = dyn_cast(Op)) { + return Cst->isNullValue(); + } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ + return CstFP->isZero(); + } else { + return false; + } +} + +SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + SDValue Temp; + + if (VT == MVT::f32) { + DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); + SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); + if (MinMax) + return MinMax; + } + + // LHS and RHS are guaranteed to be the same value type + EVT CompareVT = LHS.getValueType(); + + // Check if we can lower this to a native operation. + + // Try to lower to a SET* instruction: + // + // SET* can match the following patterns: + // + // select_cc f32, f32, -1, 0, cc_supported + // select_cc f32, f32, 1.0f, 0.0f, cc_supported + // select_cc i32, i32, -1, 0, cc_supported + // + + // Move hardware True/False values to the correct operand. 
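+  // If True/False are swapped with respect to the condition, invert the
+  // condition code (or invert it and swap the operands) so the result can
+  // still be matched by a SET* instruction.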
+ ISD::CondCode CCOpcode = cast(CC)->get(); + ISD::CondCode InverseCC = + ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + if (isHWTrueValue(False) && isHWFalseValue(True)) { + if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { + std::swap(False, True); + CC = DAG.getCondCode(InverseCC); + } else { + ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); + if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { + std::swap(False, True); + std::swap(LHS, RHS); + CC = DAG.getCondCode(SwapInvCC); + } + } + } + + if (isHWTrueValue(True) && isHWFalseValue(False) && + (CompareVT == VT || VT == MVT::i32)) { + // This can be matched by a SET* instruction. + return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); + } + + // Try to lower to a CND* instruction: + // + // CND* can match the following patterns: + // + // select_cc f32, 0.0, f32, f32, cc_supported + // select_cc f32, 0.0, i32, i32, cc_supported + // select_cc i32, 0, f32, f32, cc_supported + // select_cc i32, 0, i32, i32, cc_supported + // + + // Try to move the zero value to the RHS + if (isZero(LHS)) { + ISD::CondCode CCOpcode = cast(CC)->get(); + // Try swapping the operands + ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); + if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { + std::swap(LHS, RHS); + CC = DAG.getCondCode(CCSwapped); + } else { + // Try inverting the conditon and then swapping the operands + ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); + CCSwapped = ISD::getSetCCSwappedOperands(CCInv); + if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { + std::swap(True, False); + std::swap(LHS, RHS); + CC = DAG.getCondCode(CCSwapped); + } + } + } + if (isZero(RHS)) { + SDValue Cond = LHS; + SDValue Zero = RHS; + ISD::CondCode CCOpcode = cast(CC)->get(); + if (CompareVT != VT) { + // Bitcast True / False to the correct types. This will end up being + // a nop, but it allows us to define only a single pattern in the + // .TD files for each CND* instruction rather than having to have + // one pattern for integer True/False and one for fp True/False + True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); + False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); + } + + switch (CCOpcode) { + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); + Temp = True; + True = False; + False = Temp; + break; + default: + break; + } + SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, + Cond, Zero, + True, False, + DAG.getCondCode(CCOpcode)); + return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); + } + + // If we make it this for it means we have no native instructions to handle + // this SELECT_CC, so we must lower it. + SDValue HWTrue, HWFalse; + + if (CompareVT == MVT::f32) { + HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); + HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); + } else if (CompareVT == MVT::i32) { + HWTrue = DAG.getConstant(-1, DL, CompareVT); + HWFalse = DAG.getConstant(0, DL, CompareVT); + } + else { + llvm_unreachable("Unhandled value type in LowerSELECT_CC"); + } + + // Lower this unsupported SELECT_CC into a combination of two supported + // SELECT_CC operations. + SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); + + return DAG.getNode(ISD::SELECT_CC, DL, VT, + Cond, HWFalse, + True, False, + DAG.getCondCode(ISD::SETNE)); +} + +/// LLVM generates byte-addressed pointers. 
For indirect addressing, we need to +/// convert these pointers to a register index. Each register holds +/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the +/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used +/// for indirect addressing. +SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, + unsigned StackWidth, + SelectionDAG &DAG) const { + unsigned SRLPad; + switch(StackWidth) { + case 1: + SRLPad = 2; + break; + case 2: + SRLPad = 3; + break; + case 4: + SRLPad = 4; + break; + default: llvm_unreachable("Invalid stack width"); + } + + SDLoc DL(Ptr); + return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(SRLPad, DL, MVT::i32)); +} + +void R600TargetLowering::getStackAddress(unsigned StackWidth, + unsigned ElemIdx, + unsigned &Channel, + unsigned &PtrIncr) const { + switch (StackWidth) { + default: + case 1: + Channel = 0; + if (ElemIdx > 0) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 2: + Channel = ElemIdx % 2; + if (ElemIdx == 2) { + PtrIncr = 1; + } else { + PtrIncr = 0; + } + break; + case 4: + Channel = ElemIdx; + PtrIncr = 0; + break; + } +} + +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *StoreNode = cast(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Ptr = Op.getOperand(2); + + SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Result.getNode()) { + return Result; + } + + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + if (StoreNode->isTruncatingStore()) { + EVT VT = Value.getValueType(); + assert(VT.bitsLE(MVT::i32)); + EVT MemVT = StoreNode->getMemoryVT(); + SDValue MaskConstant; + if (MemVT == MVT::i8) { + MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); + } else { + assert(MemVT == MVT::i16); + MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); + } + SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(0x00000003, DL, VT)); + SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, + DAG.getConstant(3, DL, VT)); + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); + SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); + // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 + // vector instead. + SDValue Src[4] = { + ShiftedValue, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + Mask + }; + SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); + SDValue Args[3] = { Chain, Input, DWordAddr }; + return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, + Op->getVTList(), Args, MemVT, + StoreNode->getMemOperand()); + } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && + Value.getValueType().bitsGE(MVT::i32)) { + // Convert pointer from byte address to dword address. 
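+      // Each dword is 4 bytes, so this is simply a right shift by 2.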
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, DL, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + llvm_unreachable("Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; + } + } + + EVT ValueVT = Value.getValueType(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) { + return Ret; + } + // Lowering for indirect addressing + + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (ValueVT.isVector()) { + unsigned NumElemVT = ValueVT.getVectorNumElements(); + EVT ElemVT = ValueVT.getVectorElementType(); + SmallVector Stores(NumElemVT); + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, + Value, DAG.getConstant(i, DL, MVT::i32)); + + Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Elem, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32)); + } + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); + } else { + if (ValueVT == MVT::i8) { + Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); + } + Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); // Channel + } + + return Chain; +} + +// return (512 + (kc_bank << 12) +static int +ConstantAddressBlock(unsigned AddressSpace) { + switch (AddressSpace) { + case AMDGPUAS::CONSTANT_BUFFER_0: + return 512; + case AMDGPUAS::CONSTANT_BUFFER_1: + return 512 + 4096; + case AMDGPUAS::CONSTANT_BUFFER_2: + return 512 + 4096 * 2; + case AMDGPUAS::CONSTANT_BUFFER_3: + return 512 + 4096 * 3; + case AMDGPUAS::CONSTANT_BUFFER_4: + return 512 + 4096 * 4; + case AMDGPUAS::CONSTANT_BUFFER_5: + return 512 + 4096 * 5; + case AMDGPUAS::CONSTANT_BUFFER_6: + return 512 + 4096 * 6; + case AMDGPUAS::CONSTANT_BUFFER_7: + return 512 + 4096 * 7; + case AMDGPUAS::CONSTANT_BUFFER_8: + return 512 + 4096 * 8; + case AMDGPUAS::CONSTANT_BUFFER_9: + return 512 + 4096 * 9; + case AMDGPUAS::CONSTANT_BUFFER_10: + return 512 + 4096 * 10; + case AMDGPUAS::CONSTANT_BUFFER_11: + return 512 + 4096 * 11; + case AMDGPUAS::CONSTANT_BUFFER_12: + return 512 + 4096 * 12; + case AMDGPUAS::CONSTANT_BUFFER_13: + return 512 + 4096 * 13; + case AMDGPUAS::CONSTANT_BUFFER_14: + return 512 + 4096 * 14; + case AMDGPUAS::CONSTANT_BUFFER_15: + return 512 + 4096 * 15; + default: + return -1; + } +} + +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + SDLoc DL(Op); + LoadSDNode *LoadNode = cast(Op); + SDValue Chain = Op.getOperand(0); + SDValue Ptr = Op.getOperand(1); + SDValue LoweredLoad; + + SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); + if (Ret.getNode()) { + SDValue Ops[2] = { + Ret, + Chain + }; + return DAG.getMergeValues(Ops, DL); + } + + // Lower 
loads constant address space global variable loads + if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa(GetUnderlyingObject( + LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { + + SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, + getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); + Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(2, DL, MVT::i32)); + return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), + LoadNode->getChain(), Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + } + + if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { + SDValue MergedValues[2] = { + ScalarizeVectorLoad(Op, DAG), + Chain + }; + return DAG.getMergeValues(MergedValues, DL); + } + + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); + if (ConstantBlock > -1 && + ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || + (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { + SDValue Result; + if (isa(LoadNode->getMemOperand()->getValue()) || + isa(LoadNode->getMemOperand()->getValue()) || + isa(Ptr)) { + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. + // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + EVT NewVT = MVT::v4i32; + unsigned NumElements = 4; + if (VT.isVector()) { + NewVT = VT; + NumElements = VT.getVectorNumElements(); + } + Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, + makeArrayRef(Slots, NumElements)); + } else { + // non-constant ptr can't be folded, keeps it as a v4f32 load + Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, + DAG.getConstant(4, DL, MVT::i32)), + DAG.getConstant(LoadNode->getAddressSpace() - + AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) + ); + } + + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i32)); + } + + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, DL); + } + + // For most operations returning SDValue() will result in the node being + // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we + // need to manually expand loads that may be legal in some address spaces and + // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for + // compute shaders, since the data is sign extended when it is uploaded to the + // buffer. However SEXT loads from other address spaces are not supported, so + // we need to expand them here. 
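+  // The expansion below replaces the sign-extending load with a plain
+  // extending load followed by an explicit SIGN_EXTEND_INREG of the result.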
+ if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { + EVT MemVT = LoadNode->getMemoryVT(); + assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); + SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, + LoadNode->getPointerInfo(), MemVT, + LoadNode->isVolatile(), + LoadNode->isNonTemporal(), + LoadNode->isInvariant(), + LoadNode->getAlignment()); + SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, + DAG.getValueType(MemVT)); + + SDValue MergedValues[2] = { Res, Chain }; + return DAG.getMergeValues(MergedValues, DL); + } + + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return SDValue(); + } + + // Lowering for indirect addressing + const MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast(Subtarget->getFrameLowering()); + unsigned StackWidth = TFL->getStackWidth(MF); + + Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); + + if (VT.isVector()) { + unsigned NumElemVT = VT.getVectorNumElements(); + EVT ElemVT = VT.getVectorElementType(); + SDValue Loads[4]; + + assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " + "vector width in load"); + + for (unsigned i = 0; i < NumElemVT; ++i) { + unsigned Channel, PtrIncr; + getStackAddress(StackWidth, i, Channel, PtrIncr); + Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, + DAG.getConstant(PtrIncr, DL, MVT::i32)); + Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, + Chain, Ptr, + DAG.getTargetConstant(Channel, DL, MVT::i32), + Op.getOperand(2)); + } + for (unsigned i = NumElemVT; i < 4; ++i) { + Loads[i] = DAG.getUNDEF(ElemVT); + } + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); + LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + } else { + LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), // Channel + Op.getOperand(2)); + } + + SDValue Ops[2] = { + LoweredLoad, + Chain + }; + + return DAG.getMergeValues(Ops, DL); +} + +SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + + return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), + Chain, Jump, Cond); +} + +/// XXX Only kernel functions are supported, so we can assume for now that +/// every function is a kernel function, but in the future we should use +/// separate calling conventions for kernel and non-kernel functions. +SDValue R600TargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + MachineFunction &MF = DAG.getMachineFunction(); + R600MachineFunctionInfo *MFI = MF.getInfo(); + + SmallVector LocalIns; + + getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); + + AnalyzeFormalArguments(CCInfo, LocalIns); + + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + CCValAssign &VA = ArgLocs[i]; + const ISD::InputArg &In = Ins[i]; + EVT VT = In.VT; + EVT MemVT = VA.getLocVT(); + if (!VT.isVector() && MemVT.isVector()) { + // Get load source type if scalarized. 
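+      // A vector argument that has been scalarized is loaded one element at a
+      // time, so use the element type as the memory type.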
+ MemVT = MemVT.getVectorElementType(); + } + + if (MFI->getShaderType() != ShaderType::COMPUTE) { + unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); + SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); + InVals.push_back(Register); + continue; + } + + PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::CONSTANT_BUFFER_0); + + // i64 isn't a legal type, so the register type used ends up as i32, which + // isn't expected here. It attempts to create this sextload, but it ends up + // being invalid. Somehow this seems to work with i64 arguments, but breaks + // for <1 x i64>. + + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + ISD::LoadExtType Ext = ISD::NON_EXTLOAD; + if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { + // FIXME: This should really check the extload type, but the handling of + // extload vector parameters seems to be broken. + + // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; + Ext = ISD::SEXTLOAD; + } + + // Compute the offset from the value. + // XXX - I think PartOffset should give you this, but it seems to give the + // size of the register which isn't useful. + + unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); + unsigned PartOffset = VA.getLocMemOffset(); + unsigned Offset = 36 + VA.getLocMemOffset(); + + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); + SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, DL, MVT::i32), + DAG.getUNDEF(MVT::i32), + PtrInfo, + MemVT, false, true, true, 4); + + // 4 is the preferred alignment for the CONSTANT memory space. + InVals.push_back(Arg); + MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); + } + return Chain; +} + +EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) + return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +static SDValue CompactSwizzlableVector( + SelectionDAG &DAG, SDValue VectorEntry, + DenseMap &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + // We mask write here to teach later passes that the ith element of this + // vector is undef. Thus we can use it to reduce 128 bits reg usage, + // break false dependencies and additionnaly make assembly easier to read. 
+ RemapSwizzle[i] = 7; // SEL_MASK_WRITE + if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { + if (C->isZero()) { + RemapSwizzle[i] = 4; // SEL_0 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } else if (C->isExactlyValue(1.0)) { + RemapSwizzle[i] = 5; // SEL_1 + NewBldVec[i] = DAG.getUNDEF(MVT::f32); + } + } + + if (NewBldVec[i].getOpcode() == ISD::UNDEF) + continue; + for (unsigned j = 0; j < i; j++) { + if (NewBldVec[i] == NewBldVec[j]) { + NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); + RemapSwizzle[i] = j; + break; + } + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + +static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, + DenseMap &RemapSwizzle) { + assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); + assert(RemapSwizzle.empty()); + SDValue NewBldVec[4] = { + VectorEntry.getOperand(0), + VectorEntry.getOperand(1), + VectorEntry.getOperand(2), + VectorEntry.getOperand(3) + }; + bool isUnmovable[4] = { false, false, false, false }; + for (unsigned i = 0; i < 4; i++) { + RemapSwizzle[i] = i; + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (i == Idx) + isUnmovable[Idx] = true; + } + } + + for (unsigned i = 0; i < 4; i++) { + if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) + ->getZExtValue(); + if (isUnmovable[Idx]) + continue; + // Swap i and Idx + std::swap(NewBldVec[Idx], NewBldVec[i]); + std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); + break; + } + } + + return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), + VectorEntry.getValueType(), NewBldVec); +} + + +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, + SDValue Swz[4], SelectionDAG &DAG, + SDLoc DL) const { + assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); + // Old -> New swizzle values + DenseMap SwizzleRemap; + + BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + SwizzleRemap.clear(); + BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); + for (unsigned i = 0; i < 4; i++) { + unsigned Idx = cast(Swz[i])->getZExtValue(); + if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); + } + + return BuildVector; +} + + +//===----------------------------------------------------------------------===// +// Custom DAG Optimizations +//===----------------------------------------------------------------------===// + +SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) + case ISD::FP_ROUND: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { + return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), + Arg.getOperand(0)); + } + break; + } + + // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> + // (i32 select_cc f32, f32, -1, 0 cc) + // + // Mesa's GLSL frontend generates the above pattern a lot and we can lower + // this to one of the SET*_DX10 instructions. 
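+  // The DX10 SET* variants produce -1/0 integer results, which is exactly the
+  // select_cc form created below.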
+ case ISD::FP_TO_SINT: { + SDValue FNeg = N->getOperand(0); + if (FNeg.getOpcode() != ISD::FNEG) { + return SDValue(); + } + SDValue SelectCC = FNeg.getOperand(0); + if (SelectCC.getOpcode() != ISD::SELECT_CC || + SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS + SelectCC.getOperand(2).getValueType() != MVT::f32 || // True + !isHWTrueValue(SelectCC.getOperand(2)) || + !isHWFalseValue(SelectCC.getOperand(3))) { + return SDValue(); + } + + SDLoc dl(N); + return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), + SelectCC.getOperand(0), // LHS + SelectCC.getOperand(1), // RHS + DAG.getConstant(-1, dl, MVT::i32), // True + DAG.getConstant(0, dl, MVT::i32), // False + SelectCC.getOperand(4)); // CC + + break; + } + + // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx + // => build_vector elt0, ... , NewEltIdx, ... , eltN + case ISD::INSERT_VECTOR_ELT: { + SDValue InVec = N->getOperand(0); + SDValue InVal = N->getOperand(1); + SDValue EltNo = N->getOperand(2); + SDLoc dl(N); + + // If the inserted element is an UNDEF, just use the input vector. + if (InVal.getOpcode() == ISD::UNDEF) + return InVec; + + EVT VT = InVec.getValueType(); + + // If we can't generate a legal BUILD_VECTOR, exit + if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) + return SDValue(); + + // Check that we know which element is being inserted + if (!isa(EltNo)) + return SDValue(); + unsigned Elt = cast(EltNo)->getZExtValue(); + + // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially + // be converted to a BUILD_VECTOR). Fill in the Ops vector with the + // vector elements. + SmallVector Ops; + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + Ops.append(InVec.getNode()->op_begin(), + InVec.getNode()->op_end()); + } else if (InVec.getOpcode() == ISD::UNDEF) { + unsigned NElts = VT.getVectorNumElements(); + Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); + } else { + return SDValue(); + } + + // Insert the element + if (Elt < Ops.size()) { + // All the operands of BUILD_VECTOR must have the same type; + // we enforce that here. + EVT OpVT = Ops[0].getValueType(); + if (InVal.getValueType() != OpVT) + InVal = OpVT.bitsGT(InVal.getValueType()) ? 
+ DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : + DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); + Ops[Elt] = InVal; + } + + // Return the new vector + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + } + + // Extract_vec (Build_vector) generated by custom lowering + // also needs to be customly combined + case ISD::EXTRACT_VECTOR_ELT: { + SDValue Arg = N->getOperand(0); + if (Arg.getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return Arg->getOperand(Element); + } + } + if (Arg.getOpcode() == ISD::BITCAST && + Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { + unsigned Element = Const->getZExtValue(); + return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), + Arg->getOperand(0).getOperand(Element)); + } + } + } + + case ISD::SELECT_CC: { + // Try common optimizations + SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + if (Ret.getNode()) + return Ret; + + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> + // selectcc x, y, a, b, inv(cc) + // + // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> + // selectcc x, y, a, b, cc + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::SELECT_CC) { + return SDValue(); + } + + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + ISD::CondCode NCC = cast(N->getOperand(4))->get(); + + if (LHS.getOperand(2).getNode() != True.getNode() || + LHS.getOperand(3).getNode() != False.getNode() || + RHS.getNode() != False.getNode()) { + return SDValue(); + } + + switch (NCC) { + default: return SDValue(); + case ISD::SETNE: return LHS; + case ISD::SETEQ: { + ISD::CondCode LHSCC = cast(LHS.getOperand(4))->get(); + LHSCC = ISD::getSetCCInverse(LHSCC, + LHS.getOperand(0).getValueType().isInteger()); + if (DCI.isBeforeLegalizeOps() || + isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) + return DAG.getSelectCC(SDLoc(N), + LHS.getOperand(0), + LHS.getOperand(1), + LHS.getOperand(2), + LHS.getOperand(3), + LHSCC); + break; + } + } + return SDValue(); + } + + case AMDGPUISD::EXPORT: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[8] = { + N->getOperand(0), // Chain + SDValue(), + N->getOperand(2), // ArrayBase + N->getOperand(3), // Type + N->getOperand(4), // SWZ_X + N->getOperand(5), // SWZ_Y + N->getOperand(6), // SWZ_Z + N->getOperand(7) // SWZ_W + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); + return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); + } + case AMDGPUISD::TEXTURE_FETCH: { + SDValue Arg = N->getOperand(1); + if (Arg.getOpcode() != ISD::BUILD_VECTOR) + break; + + SDValue NewArgs[19] = { + N->getOperand(0), + N->getOperand(1), + N->getOperand(2), + N->getOperand(3), + N->getOperand(4), + N->getOperand(5), + N->getOperand(6), + N->getOperand(7), + N->getOperand(8), + N->getOperand(9), + N->getOperand(10), + N->getOperand(11), + N->getOperand(12), + N->getOperand(13), + N->getOperand(14), + N->getOperand(15), + N->getOperand(16), + N->getOperand(17), + N->getOperand(18), + }; + SDLoc DL(N); + NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); + return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); + } + } + + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); +} + +static bool +FoldOperand(SDNode *ParentNode, unsigned 
SrcIdx, SDValue &Src, SDValue &Neg, + SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { + const R600InstrInfo *TII = + static_cast(DAG.getSubtarget().getInstrInfo()); + if (!Src.isMachineOpcode()) + return false; + switch (Src.getMachineOpcode()) { + case AMDGPU::FNEG_R600: + if (!Neg.getNode()) + return false; + Src = Src.getOperand(0); + Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::FABS_R600: + if (!Abs.getNode()) + return false; + Src = Src.getOperand(0); + Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); + return true; + case AMDGPU::CONST_COPY: { + unsigned Opcode = ParentNode->getMachineOpcode(); + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + + if (!Sel.getNode()) + return false; + + SDValue CstOffset = Src.getOperand(0); + if (ParentNode->getValueType(0).isVector()) + return false; + + // Gather constants values + int SrcIndices[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + std::vector Consts; + for (int OtherSrcIdx : SrcIndices) { + int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); + if (OtherSrcIdx < 0 || OtherSelIdx < 0) + continue; + if (HasDst) { + OtherSrcIdx--; + OtherSelIdx--; + } + if (RegisterSDNode *Reg = + dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { + if (Reg->getReg() == AMDGPU::ALU_CONST) { + ConstantSDNode *Cst + = cast(ParentNode->getOperand(OtherSelIdx)); + Consts.push_back(Cst->getZExtValue()); + } + } + } + + ConstantSDNode *Cst = cast(CstOffset); + Consts.push_back(Cst->getZExtValue()); + if (!TII->fitsConstReadLimitations(Consts)) { + return false; + } + + Sel = CstOffset; + Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + return true; + } + case AMDGPU::MOV_IMM_I32: + case AMDGPU::MOV_IMM_F32: { + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + uint64_t ImmValue = 0; + + + if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); + float FloatValue = FPC->getValueAPF().convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); + } + } else { + ConstantSDNode *C = dyn_cast(Src.getOperand(0)); + uint64_t Value = C->getZExtValue(); + if (Value == 0) { + ImmReg = AMDGPU::ZERO; + } else if (Value == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = Value; + } + } + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. 
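+    // Only fold this immediate if the instruction's literal operand is still
+    // zero, i.e. no other immediate has been folded into it yet.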
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (!Imm.getNode()) + return false; + ConstantSDNode *C = dyn_cast(Imm); + assert(C); + if (C->getZExtValue()) + return false; + Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); + } + Src = DAG.getRegister(ImmReg, MVT::i32); + return true; + } + default: + return false; + } +} + + +/// \brief Fold the instructions after selecting them +SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = + static_cast(DAG.getSubtarget().getInstrInfo()); + if (!Node->isMachineOpcode()) + return Node; + unsigned Opcode = Node->getMachineOpcode(); + SDValue FakeOp; + + std::vector Ops(Node->op_begin(), Node->op_end()); + + if (Opcode == AMDGPU::DOT_4) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + }; + for (unsigned i = 0; i < 8; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue &Abs = Ops[AbsIdx[i] - 1]; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + if (HasDst) + SelIdx--; + SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::REG_SEQUENCE) { + for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { + SDValue &Src = Ops[i]; + if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } else if (Opcode == AMDGPU::CLAMP_R600) { + SDValue Src = Node->getOperand(0); + if (!Src.isMachineOpcode() || + !TII->hasInstrModifiers(Src.getMachineOpcode())) + return Node; + int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), + AMDGPU::OpName::clamp); + if (ClampIdx < 0) + return Node; + SDLoc DL(Node); + std::vector Ops(Src->op_begin(), Src->op_end()); + Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); + return DAG.getMachineNode(Src.getMachineOpcode(), DL, + Node->getVTList(), Ops); + } else { + if (!TII->hasInstrModifiers(Opcode)) + return Node; + int OperandIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + -1 + }; + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return Node; + SDValue &Src = Ops[OperandIdx[i] - 1]; + SDValue &Neg = Ops[NegIdx[i] - 1]; + SDValue FakeAbs; + SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; + bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); + int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + if (HasDst) { + SelIdx--; + ImmIdx--; + } + SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; + SDValue &Imm = Ops[ImmIdx]; + if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + } + + return Node; +} diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h new file mode 100644 index 00000000000..c06d3c4fd30 --- /dev/null +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -0,0 +1,80 @@ +//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H + +#include "AMDGPUISelLowering.h" + +namespace llvm { + +class R600InstrInfo; + +class R600TargetLowering : public AMDGPUTargetLowering { +public: + R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock * BB) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + void ReplaceNodeResults(SDNode * N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + EVT getSetCCResultType(LLVMContext &, EVT VT) const override; +private: + unsigned Gen; + /// Each OpenCL kernel has nine implicit parameters that are stored in the + /// first nine dwords of a Vertex Buffer. These implicit parameters are + /// lowered to load instructions which retrieve the values from the Vertex + /// Buffer. + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + SDLoc DL, unsigned DwordOffset) const; + + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, + MachineRegisterInfo & MRI, unsigned dword_offset) const; + SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, + SDLoc DL) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, + unsigned mainop, unsigned ovf) const; + + SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, + SelectionDAG &DAG) const; + void getStackAddress(unsigned StackWidth, unsigned ElemIdx, + unsigned &Channel, unsigned &PtrIncr) const; + bool isZero(SDValue Op) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; +}; + +} // End namespace llvm; + +#endif diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td new file mode 100644 index 00000000000..0ffd485476e --- /dev/null +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -0,0 +1,495 @@ +//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 Instruction format definitions. 
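+// These classes model the 32-bit instruction words (ALU, LDS, vertex fetch,
+// texture fetch and control flow) from which R600 family encodings are built.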
+// +//===----------------------------------------------------------------------===// + +class InstR600 pattern, + InstrItinClass itin> + : AMDGPUInst { + + field bits<64> Inst; + bit Trig = 0; + bit Op3 = 0; + bit isVector = 0; + bits<2> FlagOperandIdx = 0; + bit Op1 = 0; + bit Op2 = 0; + bit LDS_1A = 0; + bit LDS_1A1D = 0; + bit HasNativeOperands = 0; + bit VTXInst = 0; + bit TEXInst = 0; + bit ALUInst = 0; + bit IsExport = 0; + bit LDS_1A2D = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = itin; + + // No AsmMatcher support. + let isCodeGenOnly = 1; + + let TSFlags{4} = Trig; + let TSFlags{5} = Op3; + + // Vector instructions are instructions that must fill all slots in an + // instruction group + let TSFlags{6} = isVector; + let TSFlags{8-7} = FlagOperandIdx; + let TSFlags{9} = HasNativeOperands; + let TSFlags{10} = Op1; + let TSFlags{11} = Op2; + let TSFlags{12} = VTXInst; + let TSFlags{13} = TEXInst; + let TSFlags{14} = ALUInst; + let TSFlags{15} = LDS_1A; + let TSFlags{16} = LDS_1A1D; + let TSFlags{17} = IsExport; + let TSFlags{18} = LDS_1A2D; +} + +//===----------------------------------------------------------------------===// +// ALU instructions +//===----------------------------------------------------------------------===// + +class R600_ALU_LDS_Word0 { + field bits<32> Word0; + + bits<11> src0; + bits<1> src0_rel; + bits<11> src1; + bits<1> src1_rel; + bits<3> index_mode = 0; + bits<2> pred_sel; + bits<1> last; + + bits<9> src0_sel = src0{8-0}; + bits<2> src0_chan = src0{10-9}; + bits<9> src1_sel = src1{8-0}; + bits<2> src1_chan = src1{10-9}; + + let Word0{8-0} = src0_sel; + let Word0{9} = src0_rel; + let Word0{11-10} = src0_chan; + let Word0{21-13} = src1_sel; + let Word0{22} = src1_rel; + let Word0{24-23} = src1_chan; + let Word0{28-26} = index_mode; + let Word0{30-29} = pred_sel; + let Word0{31} = last; +} + +class R600ALU_Word0 : R600_ALU_LDS_Word0 { + + bits<1> src0_neg; + bits<1> src1_neg; + + let Word0{12} = src0_neg; + let Word0{25} = src1_neg; +} + +class R600ALU_Word1 { + field bits<32> Word1; + + bits<11> dst; + bits<3> bank_swizzle; + bits<1> dst_rel; + bits<1> clamp; + + bits<7> dst_sel = dst{6-0}; + bits<2> dst_chan = dst{10-9}; + + let Word1{20-18} = bank_swizzle; + let Word1{27-21} = dst_sel; + let Word1{28} = dst_rel; + let Word1{30-29} = dst_chan; + let Word1{31} = clamp; +} + +class R600ALU_Word1_OP2 alu_inst> : R600ALU_Word1{ + + bits<1> src0_abs; + bits<1> src1_abs; + bits<1> update_exec_mask; + bits<1> update_pred; + bits<1> write; + bits<2> omod; + + let Word1{0} = src0_abs; + let Word1{1} = src1_abs; + let Word1{2} = update_exec_mask; + let Word1{3} = update_pred; + let Word1{4} = write; + let Word1{6-5} = omod; + let Word1{17-7} = alu_inst; +} + +class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ + + bits<11> src2; + bits<1> src2_rel; + bits<1> src2_neg; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = src2_neg; + let Word1{17-13} = alu_inst; +} + +class R600LDS_Word1 { + field bits<32> Word1; + + bits<11> src2; + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + bits<1> src2_rel; + // offset specifies the stride offset to the second set of data to be read + // from. This is a dword offset. 
+ bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP + bits<3> bank_swizzle; + bits<6> lds_op; + bits<2> dst_chan = 0; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{17-13} = alu_inst; + let Word1{20-18} = bank_swizzle; + let Word1{26-21} = lds_op; + let Word1{30-29} = dst_chan; +} + + +/* +XXX: R600 subtarget uses a slightly different encoding than the other +subtargets. We currently handle this in R600MCCodeEmitter, but we may +want to use these instruction classes in the future. + +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { + + bits<1> fog_merge; + bits<10> alu_inst; + + let Inst{37} = fog_merge; + let Inst{39-38} = omod; + let Inst{49-40} = alu_inst; +} + +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { + + bits<11> alu_inst; + + let Inst{38-37} = omod; + let Inst{49-39} = alu_inst; +} +*/ + +//===----------------------------------------------------------------------===// +// Vertex Fetch instructions +//===----------------------------------------------------------------------===// + +class VTX_WORD0 { + field bits<32> Word0; + bits<7> src_gpr; + bits<5> VC_INST; + bits<2> FETCH_TYPE; + bits<1> FETCH_WHOLE_QUAD; + bits<8> BUFFER_ID; + bits<1> SRC_REL; + bits<2> SRC_SEL_X; + + let Word0{4-0} = VC_INST; + let Word0{6-5} = FETCH_TYPE; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = BUFFER_ID; + let Word0{22-16} = src_gpr; + let Word0{23} = SRC_REL; + let Word0{25-24} = SRC_SEL_X; +} + +class VTX_WORD0_eg : VTX_WORD0 { + + bits<6> MEGA_FETCH_COUNT; + + let Word0{31-26} = MEGA_FETCH_COUNT; +} + +class VTX_WORD0_cm : VTX_WORD0 { + + bits<2> SRC_SEL_Y; + bits<2> STRUCTURED_READ; + bits<1> LDS_REQ; + bits<1> COALESCED_READ; + + let Word0{27-26} = SRC_SEL_Y; + let Word0{29-28} = STRUCTURED_READ; + let Word0{30} = LDS_REQ; + let Word0{31} = COALESCED_READ; +} + +class VTX_WORD1_GPR { + field bits<32> Word1; + bits<7> dst_gpr; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<1> USE_CONST_FIELDS; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL; + bits<1> FORMAT_COMP_ALL; + bits<1> SRF_MODE_ALL; + + let Word1{6-0} = dst_gpr; + let Word1{7} = DST_REL; + let Word1{8} = 0; // Reserved + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{21} = USE_CONST_FIELDS; + let Word1{27-22} = DATA_FORMAT; + let Word1{29-28} = NUM_FORMAT_ALL; + let Word1{30} = FORMAT_COMP_ALL; + let Word1{31} = SRF_MODE_ALL; +} + +//===----------------------------------------------------------------------===// +// Texture fetch instructions +//===----------------------------------------------------------------------===// + +class TEX_WORD0 { + field bits<32> Word0; + + bits<5> TEX_INST; + bits<2> INST_MOD; + bits<1> FETCH_WHOLE_QUAD; + bits<8> RESOURCE_ID; + bits<7> SRC_GPR; + bits<1> SRC_REL; + bits<1> ALT_CONST; + bits<2> RESOURCE_INDEX_MODE; + bits<2> SAMPLER_INDEX_MODE; + + let Word0{4-0} = TEX_INST; + let Word0{6-5} = INST_MOD; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = RESOURCE_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{24} = ALT_CONST; + let Word0{26-25} = RESOURCE_INDEX_MODE; + let Word0{28-27} = SAMPLER_INDEX_MODE; +} + +class TEX_WORD1 { + field bits<32> Word1; + + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<7> LOD_BIAS; + bits<1> COORD_TYPE_X; + bits<1> COORD_TYPE_Y; + bits<1> 
COORD_TYPE_Z; + bits<1> COORD_TYPE_W; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{27-21} = LOD_BIAS; + let Word1{28} = COORD_TYPE_X; + let Word1{29} = COORD_TYPE_Y; + let Word1{30} = COORD_TYPE_Z; + let Word1{31} = COORD_TYPE_W; +} + +class TEX_WORD2 { + field bits<32> Word2; + + bits<5> OFFSET_X; + bits<5> OFFSET_Y; + bits<5> OFFSET_Z; + bits<5> SAMPLER_ID; + bits<3> SRC_SEL_X; + bits<3> SRC_SEL_Y; + bits<3> SRC_SEL_Z; + bits<3> SRC_SEL_W; + + let Word2{4-0} = OFFSET_X; + let Word2{9-5} = OFFSET_Y; + let Word2{14-10} = OFFSET_Z; + let Word2{19-15} = SAMPLER_ID; + let Word2{22-20} = SRC_SEL_X; + let Word2{25-23} = SRC_SEL_Y; + let Word2{28-26} = SRC_SEL_Z; + let Word2{31-29} = SRC_SEL_W; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + +class CF_WORD1_R600 { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<3> COUNT; + bits<6> CALL_COUNT; + bits<1> COUNT_3; + bits<1> END_OF_PROGRAM; + bits<1> VALID_PIXEL_MODE; + bits<7> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{12-10} = COUNT; + let Word1{18-13} = CALL_COUNT; + let Word1{19} = COUNT_3; + let Word1{21} = END_OF_PROGRAM; + let Word1{22} = VALID_PIXEL_MODE; + let Word1{29-23} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_WORD0_EG { + field bits<32> Word0; + + bits<24> ADDR; + bits<3> JUMPTABLE_SEL; + + let Word0{23-0} = ADDR; + let Word0{26-24} = JUMPTABLE_SEL; +} + +class CF_WORD1_EG { + field bits<32> Word1; + + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<6> COUNT; + bits<1> VALID_PIXEL_MODE; + bits<1> END_OF_PROGRAM; + bits<8> CF_INST; + bits<1> BARRIER; + + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{15-10} = COUNT; + let Word1{20} = VALID_PIXEL_MODE; + let Word1{21} = END_OF_PROGRAM; + let Word1{29-22} = CF_INST; + let Word1{31} = BARRIER; +} + +class CF_ALU_WORD0 { + field bits<32> Word0; + + bits<22> ADDR; + bits<4> KCACHE_BANK0; + bits<4> KCACHE_BANK1; + bits<2> KCACHE_MODE0; + + let Word0{21-0} = ADDR; + let Word0{25-22} = KCACHE_BANK0; + let Word0{29-26} = KCACHE_BANK1; + let Word0{31-30} = KCACHE_MODE0; +} + +class CF_ALU_WORD1 { + field bits<32> Word1; + + bits<2> KCACHE_MODE1; + bits<8> KCACHE_ADDR0; + bits<8> KCACHE_ADDR1; + bits<7> COUNT; + bits<1> ALT_CONST; + bits<4> CF_INST; + bits<1> WHOLE_QUAD_MODE; + bits<1> BARRIER; + + let Word1{1-0} = KCACHE_MODE1; + let Word1{9-2} = KCACHE_ADDR0; + let Word1{17-10} = KCACHE_ADDR1; + let Word1{24-18} = COUNT; + let Word1{25} = ALT_CONST; + let Word1{29-26} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; + let Word1{31} = BARRIER; +} + +class CF_ALLOC_EXPORT_WORD0_RAT { + field bits<32> Word0; + + bits<4> rat_id; + bits<6> rat_inst; + bits<2> rim; + bits<2> type; + bits<7> rw_gpr; + bits<1> rw_rel; + bits<7> index_gpr; + bits<2> elem_size; + + let Word0{3-0} = rat_id; + let Word0{9-4} = rat_inst; + let Word0{10} = 0; // Reserved + let Word0{12-11} = rim; + let Word0{14-13} = type; + let Word0{21-15} = rw_gpr; + let Word0{22} = rw_rel; + let Word0{29-23} = index_gpr; + let Word0{31-30} = elem_size; +} + +class CF_ALLOC_EXPORT_WORD1_BUF 
{ + field bits<32> Word1; + + bits<12> array_size; + bits<4> comp_mask; + bits<4> burst_count; + bits<1> vpm; + bits<1> eop; + bits<8> cf_inst; + bits<1> mark; + bits<1> barrier; + + let Word1{11-0} = array_size; + let Word1{15-12} = comp_mask; + let Word1{19-16} = burst_count; + let Word1{20} = vpm; + let Word1{21} = eop; + let Word1{29-22} = cf_inst; + let Word1{30} = mark; + let Word1{31} = barrier; +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp new file mode 100644 index 00000000000..5ef883cbcad --- /dev/null +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -0,0 +1,1435 @@ +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + +#include "R600InstrInfo.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_CTOR_DTOR +#include "AMDGPUGenDFAPacketizer.inc" + +R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { + return RI; +} + +bool R600InstrInfo::isTrig(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; +} + +bool R600InstrInfo::isVector(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; +} + +void +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned VectorComponents = 0; + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + VectorComponents = 4; + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + VectorComponents = 2; + } + + if (VectorComponents > 0) { + for (unsigned I = 0; I < VectorComponents; I++) { + unsigned SubRegIndex = RI.getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + RI.getSubReg(DestReg, SubRegIndex), + RI.getSubReg(SrcReg, SubRegIndex)) + .addReg(DestReg, + RegState::Define | RegState::Implicit); + } + } else { + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + DestReg, SrcReg); + NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + .setIsKill(KillSrc); + } +} + +/// \returns true if \p MBBI can be moved into a new basic. 
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const { + for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), + E = MBBI->operands_end(); I != E; ++I) { + if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && + I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) + return false; + } + return true; +} + +bool R600InstrInfo::isMov(unsigned Opcode) const { + + + switch(Opcode) { + default: return false; + case AMDGPU::MOV: + case AMDGPU::MOV_IMM_F32: + case AMDGPU::MOV_IMM_I32: + return true; + } +} + +// Some instructions act as place holders to emulate operations that the GPU +// hardware does automatically. This function can be used to check if +// an opcode falls into this category. +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { + switch (Opcode) { + default: return false; + case AMDGPU::RETURN: + return true; + } +} + +bool R600InstrInfo::isReductionOp(unsigned Opcode) const { + return false; +} + +bool R600InstrInfo::isCubeOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::CUBE_r600_pseudo: + case AMDGPU::CUBE_r600_real: + case AMDGPU::CUBE_eg_pseudo: + case AMDGPU::CUBE_eg_real: + return true; + } +} + +bool R600InstrInfo::isALUInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return (TargetFlags & R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::LDS_1A) | + (TargetFlags & R600_InstFlag::LDS_1A1D) | + (TargetFlags & R600_InstFlag::LDS_1A2D)); +} + +bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; +} + +bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { + return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; +} + +bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { + if (isALUInstr(MI->getOpcode())) + return true; + if (isVector(*MI) || isCubeOp(MI->getOpcode())) + return true; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::COPY: + case AMDGPU::DOT_4: + return true; + default: + return false; + } +} + +bool R600InstrInfo::isTransOnly(unsigned Opcode) const { + if (ST.hasCaymanISA()) + return false; + return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); +} + +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { + return isTransOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { + return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); +} + +bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { + return isVectorOnly(MI->getOpcode()); +} + +bool R600InstrInfo::isExport(unsigned Opcode) const { + return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); +} + +bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { + return ST.hasVertexCache() && IS_VTX(get(Opcode)); +} + +bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const 
R600MachineFunctionInfo *MFI = MF->getInfo(); + return MFI->getShaderType() != ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode()); +} + +bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { + return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); +} + +bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const R600MachineFunctionInfo *MFI = MF->getInfo(); + return (MFI->getShaderType() == ShaderType::COMPUTE && + usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); +} + +bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::KILLGT: + case AMDGPU::GROUP_BARRIER: + return true; + default: + return false; + } +} + +bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { + return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; +} + +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { + if (!isALUInstr(MI->getOpcode())) { + return false; + } + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + if (!I->isReg() || !I->isUse() || + TargetRegisterInfo::isVirtualRegister(I->getReg())) + continue; + + if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + return true; + } + return false; +} + +int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { + static const unsigned OpTable[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + + assert (SrcNum < 3); + return getOperandIdx(Opcode, OpTable[SrcNum]); +} + +int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { + static const unsigned SrcSelTable[][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + }; + + for (const auto &Row : SrcSelTable) { + if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { + return getOperandIdx(Opcode, Row[1]); + } + } + return -1; +} + +SmallVector, 3> +R600InstrInfo::getSrcs(MachineInstr *MI) const { + SmallVector, 3> Result; + + if (MI->getOpcode() == AMDGPU::DOT_4) { + static const unsigned OpTable[8][2] = { + {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, + {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, + {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, + {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, + {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, + {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, + {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, + {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + }; + + for (unsigned j = 0; j < 8; j++) { + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + OpTable[j][0])); + unsigned Reg = MO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), + 
OpTable[j][1])).getImm(); + Result.push_back(std::pair(&MO, Sel)); + continue; + } + + } + return Result; + } + + static const unsigned OpTable[3][2] = { + {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, + {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, + {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + }; + + for (unsigned j = 0; j < 3; j++) { + int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (SrcIdx < 0) + break; + MachineOperand &MO = MI->getOperand(SrcIdx); + unsigned Reg = MI->getOperand(SrcIdx).getReg(); + if (Reg == AMDGPU::ALU_CONST) { + unsigned Sel = MI->getOperand( + getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); + Result.push_back(std::pair(&MO, Sel)); + continue; + } + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned Imm = MI->getOperand( + getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); + Result.push_back(std::pair(&MO, Imm)); + continue; + } + Result.push_back(std::pair(&MO, 0)); + } + return Result; +} + +std::vector > +R600InstrInfo::ExtractSrcs(MachineInstr *MI, + const DenseMap &PV, + unsigned &ConstCount) const { + ConstCount = 0; + ArrayRef> Srcs = getSrcs(MI); + const std::pair DummyPair(-1, 0); + std::vector > Result; + unsigned i = 0; + for (unsigned n = Srcs.size(); i < n; ++i) { + unsigned Reg = Srcs[i].first->getReg(); + unsigned Index = RI.getEncodingValue(Reg) & 0xff; + if (Reg == AMDGPU::OQAP) { + Result.push_back(std::pair(Index, 0)); + } + if (PV.find(Reg) != PV.end()) { + // 255 is used to tells its a PS/PV reg + Result.push_back(std::pair(255, 0)); + continue; + } + if (Index > 127) { + ConstCount++; + Result.push_back(DummyPair); + continue; + } + unsigned Chan = RI.getHWRegChan(Reg); + Result.push_back(std::pair(Index, Chan)); + } + for (; i < 3; ++i) + Result.push_back(DummyPair); + return Result; +} + +static std::vector > +Swizzle(std::vector > Src, + R600InstrInfo::BankSwizzle Swz) { + if (Src[0] == Src[1]) + Src[1].first = -1; + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: + break; + case R600InstrInfo::ALU_VEC_021_SCL_122: + std::swap(Src[1], Src[2]); + break; + case R600InstrInfo::ALU_VEC_102_SCL_221: + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_120_SCL_212: + std::swap(Src[0], Src[1]); + std::swap(Src[0], Src[2]); + break; + case R600InstrInfo::ALU_VEC_201: + std::swap(Src[0], Src[2]); + std::swap(Src[0], Src[1]); + break; + case R600InstrInfo::ALU_VEC_210: + std::swap(Src[0], Src[2]); + break; + } + return Src; +} + +static unsigned +getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { + switch (Swz) { + case R600InstrInfo::ALU_VEC_012_SCL_210: { + unsigned Cycles[3] = { 2, 1, 0}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_021_SCL_122: { + unsigned Cycles[3] = { 1, 2, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_120_SCL_212: { + unsigned Cycles[3] = { 2, 1, 2}; + return Cycles[Op]; + } + case R600InstrInfo::ALU_VEC_102_SCL_221: { + unsigned Cycles[3] = { 2, 2, 1}; + return Cycles[Op]; + } + default: + llvm_unreachable("Wrong Swizzle for Trans Slot"); + return 0; + } +} + +/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed +/// in the same Instruction Group while meeting read port limitations given a +/// Swz swizzle sequence. 
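+/// An illustrative reading of the check below: Vector[Chan][Cycle] records
+/// which GPR index a given channel's read port fetches in a given cycle. If
+/// two sources of the group would need two different GPR indices on the same
+/// (channel, cycle) slot, the group is only legal up to the conflicting
+/// instruction, whose index is returned.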
+unsigned R600InstrInfo::isLegalUpTo( + const std::vector > > &IGSrcs, + const std::vector &Swz, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + int Vector[4][3]; + memset(Vector, -1, sizeof(Vector)); + for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { + const std::vector > &Srcs = + Swizzle(IGSrcs[i], Swz[i]); + for (unsigned j = 0; j < 3; j++) { + const std::pair &Src = Srcs[j]; + if (Src.first < 0 || Src.first == 255) + continue; + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && + Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { + // The value from output queue A (denoted by register OQAP) can + // only be fetched during the first cycle. + return false; + } + // OQAP does not count towards the normal read port restrictions + continue; + } + if (Vector[Src.second][j] < 0) + Vector[Src.second][j] = Src.first; + if (Vector[Src.second][j] != Src.first) + return i; + } + } + // Now check Trans Alu + for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) { + const std::pair &Src = TransSrcs[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (Src.first == 255) + continue; + if (Vector[Src.second][Cycle] < 0) + Vector[Src.second][Cycle] = Src.first; + if (Vector[Src.second][Cycle] != Src.first) + return IGSrcs.size() - 1; + } + return IGSrcs.size(); +} + +/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next +/// (in lexicographic term) swizzle sequence assuming that all swizzles after +/// Idx can be skipped +static bool +NextPossibleSolution( + std::vector &SwzCandidate, + unsigned Idx) { + assert(Idx < SwzCandidate.size()); + int ResetIdx = Idx; + while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210) + ResetIdx --; + for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) { + SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210; + } + if (ResetIdx == -1) + return false; + int NextSwizzle = SwzCandidate[ResetIdx] + 1; + SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle; + return true; +} + +/// Enumerate all possible Swizzle sequence to find one that can meet all +/// read port requirements. +bool R600InstrInfo::FindSwizzleForVectorSlot( + const std::vector > > &IGSrcs, + std::vector &SwzCandidate, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const { + unsigned ValidUpTo = 0; + do { + ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz); + if (ValidUpTo == IGSrcs.size()) + return true; + } while (NextPossibleSolution(SwzCandidate, ValidUpTo)); + return false; +} + +/// Instructions in Trans slot can't read gpr at cycle 0 if they also read +/// a const, and can't read a gpr at cycle 1 if they read 2 const. 
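+/// For example, if a Trans-slot instruction reads two constants, its remaining
+/// GPR source may only be fetched in cycle 2, so only swizzles that map it to
+/// cycle 2 pass isConstCompatible below; with three constants no swizzle can
+/// work and the group is rejected outright.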
+static bool +isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, + const std::vector > &TransOps, + unsigned ConstCount) { + // TransALU can't read 3 constants + if (ConstCount > 2) + return false; + for (unsigned i = 0, e = TransOps.size(); i < e; ++i) { + const std::pair &Src = TransOps[i]; + unsigned Cycle = getTransSwizzle(TransSwz, i); + if (Src.first < 0) + continue; + if (ConstCount > 0 && Cycle == 0) + return false; + if (ConstCount > 1 && Cycle == 1) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsReadPortLimitations(const std::vector &IG, + const DenseMap &PV, + std::vector &ValidSwizzle, + bool isLastAluTrans) + const { + //Todo : support shared src0 - src1 operand + + std::vector > > IGSrcs; + ValidSwizzle.clear(); + unsigned ConstCount; + BankSwizzle TransBS = ALU_VEC_012_SCL_210; + for (unsigned i = 0, e = IG.size(); i < e; ++i) { + IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); + unsigned Op = getOperandIdx(IG[i]->getOpcode(), + AMDGPU::OpName::bank_swizzle); + ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) + IG[i]->getOperand(Op).getImm()); + } + std::vector > TransOps; + if (!isLastAluTrans) + return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); + + TransOps = std::move(IGSrcs.back()); + IGSrcs.pop_back(); + ValidSwizzle.pop_back(); + + static const R600InstrInfo::BankSwizzle TransSwz[] = { + ALU_VEC_012_SCL_210, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221 + }; + for (unsigned i = 0; i < 4; i++) { + TransBS = TransSwz[i]; + if (!isConstCompatible(TransBS, TransOps, ConstCount)) + continue; + bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, + TransBS); + if (Result) { + ValidSwizzle.push_back(TransBS); + return true; + } + } + + return false; +} + + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector &Consts) + const { + assert (Consts.size() <= 12 && "Too many operands in instructions group"); + unsigned Pair1 = 0, Pair2 = 0; + for (unsigned i = 0, n = Consts.size(); i < n; ++i) { + unsigned ReadConstHalf = Consts[i] & 2; + unsigned ReadConstIndex = Consts[i] & (~3); + unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; + if (!Pair1) { + Pair1 = ReadHalfConst; + continue; + } + if (Pair1 == ReadHalfConst) + continue; + if (!Pair2) { + Pair2 = ReadHalfConst; + continue; + } + if (Pair2 != ReadHalfConst) + return false; + } + return true; +} + +bool +R600InstrInfo::fitsConstReadLimitations(const std::vector &MIs) + const { + std::vector Consts; + SmallSet Literals; + for (unsigned i = 0, n = MIs.size(); i < n; i++) { + MachineInstr *MI = MIs[i]; + if (!isALUInstr(MI->getOpcode())) + continue; + + ArrayRef> Srcs = getSrcs(MI); + + for (unsigned j = 0, e = Srcs.size(); j < e; j++) { + std::pair Src = Srcs[j]; + if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + Literals.insert(Src.second); + if (Literals.size() > 4) + return false; + if (Src.first->getReg() == AMDGPU::ALU_CONST) + Consts.push_back(Src.second); + if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || + AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; + unsigned Chan = RI.getHWRegChan(Src.first->getReg()); + Consts.push_back((Index << 2) | Chan); + } + } + } + return fitsConstReadLimitations(Consts); +} + +DFAPacketizer * +R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { + const InstrItineraryData *II = STI.getInstrItineraryData(); + return 
static_cast(STI).createDFAPacketizer(II); +} + +static bool +isPredicateSetter(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::PRED_X: + return true; + default: + return false; + } +} + +static MachineInstr * +findFirstPredicateSetterFrom(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + while (I != MBB.begin()) { + --I; + MachineInstr *MI = I; + if (isPredicateSetter(MI->getOpcode())) + return MI; + } + + return nullptr; +} + +static +bool isJump(unsigned Opcode) { + return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; +} + +static bool isBranch(unsigned Opcode) { + return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || + Opcode == AMDGPU::BRANCH_COND_f32; +} + +bool +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { + // Most of the following comes from the ARM implementation of AnalyzeBranch + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + // AMDGPU::BRANCH* instructions are only available after isel and are not + // handled + if (isBranch(I->getOpcode())) + return true; + if (!isJump(static_cast(I)->getOpcode())) { + return false; + } + + // Remove successive JUMP + while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + MachineBasicBlock::iterator PriorI = std::prev(I); + if (AllowModify) + I->removeFromParent(); + I = PriorI; + } + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || + !isJump(static_cast(--I)->getOpcode())) { + if (LastOpc == AMDGPU::JUMP) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastOpc == AMDGPU::JUMP_COND) { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If the block ends with a B and a Bcc, handle it. + if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + MachineInstr *predSet = --I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = SecondLastInst->getOperand(0).getMBB(); + FBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + + // Otherwise, can't handle this. 
+ return true; +} + +static +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); + It != E; ++It) { + if (It->getOpcode() == AMDGPU::CF_ALU || + It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return std::prev(It.base()); + } + return MBB.end(); +} + +unsigned +R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef Cond, + DebugLoc DL) const { + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) { + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + return 1; + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 1; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 1; + } + } else { + MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); + assert(PredSet && "No previous predicate !"); + addFlag(PredSet, 0, MO_FLAG_PUSH); + PredSet->getOperand(2).setImm(Cond[1].getImm()); + BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + .addMBB(TBB) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 2; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + return 2; + } +} + +unsigned +R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + + // Note : we leave PRED* instructions there. + // They may be needed when predicating instructions. + + MachineBasicBlock::iterator I = MBB.end(); + + if (I == MBB.begin()) { + return 0; + } + --I; + switch (I->getOpcode()) { + default: + return 0; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + I = MBB.end(); + + if (I == MBB.begin()) { + return 1; + } + --I; + switch (I->getOpcode()) { + // FIXME: only one case?? 
+ default: + return 1; + case AMDGPU::JUMP_COND: { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); + break; + } + case AMDGPU::JUMP: + I->eraseFromParent(); + break; + } + return 2; +} + +bool +R600InstrInfo::isPredicated(const MachineInstr *MI) const { + int idx = MI->findFirstPredOperandIdx(); + if (idx < 0) + return false; + + unsigned Reg = MI->getOperand(idx).getReg(); + switch (Reg) { + default: return false; + case AMDGPU::PRED_SEL_ONE: + case AMDGPU::PRED_SEL_ZERO: + case AMDGPU::PREDICATE_BIT: + return true; + } +} + +bool +R600InstrInfo::isPredicable(MachineInstr *MI) const { + // XXX: KILL* instructions can be predicated, but they must be the last + // instruction in a clause, so this means any instructions after them cannot + // be predicated. Until we have proper support for instruction clauses in the + // backend, we will mark KILL* instructions as unpredicable. + + if (MI->getOpcode() == AMDGPU::KILLGT) { + return false; + } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + // If the clause start in the middle of MBB then the MBB has more + // than a single clause, unable to predicate several clauses. + if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + return false; + // TODO: We don't support KC merging atm + if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) + return false; + return true; + } else if (isVector(*MI)) { + return false; + } else { + return AMDGPUInstrInfo::isPredicable(MI); + } +} + + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const{ + return true; +} + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + const BranchProbability &Probability) const { + return true; +} + +bool +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + const BranchProbability &Probability) + const { + return true; +} + +bool +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const { + return false; +} + + +bool +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) const { + MachineOperand &MO = Cond[1]; + switch (MO.getImm()) { + case OPCODE_IS_ZERO_INT: + MO.setImm(OPCODE_IS_NOT_ZERO_INT); + break; + case OPCODE_IS_NOT_ZERO_INT: + MO.setImm(OPCODE_IS_ZERO_INT); + break; + case OPCODE_IS_ZERO: + MO.setImm(OPCODE_IS_NOT_ZERO); + break; + case OPCODE_IS_NOT_ZERO: + MO.setImm(OPCODE_IS_ZERO); + break; + default: + return true; + } + + MachineOperand &MO2 = Cond[2]; + switch (MO2.getReg()) { + case AMDGPU::PRED_SEL_ZERO: + MO2.setReg(AMDGPU::PRED_SEL_ONE); + break; + case AMDGPU::PRED_SEL_ONE: + MO2.setReg(AMDGPU::PRED_SEL_ZERO); + break; + default: + return true; + } + return false; +} + +bool +R600InstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const { + return isPredicateSetter(MI->getOpcode()); +} + + +bool +R600InstrInfo::SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const { + return false; +} + + +bool +R600InstrInfo::PredicateInstruction(MachineInstr *MI, + ArrayRef Pred) const { + int PIdx = 
MI->findFirstPredOperandIdx(); + + if (MI->getOpcode() == AMDGPU::CF_ALU) { + MI->getOperand(8).setImm(0); + return true; + } + + if (MI->getOpcode() == AMDGPU::DOT_4) { + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) + .setReg(Pred[2].getReg()); + MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) + .setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setReg(Pred[2].getReg()); + MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + return false; +} + +unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { + return 2; +} + +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (PredCost) + *PredCost = 2; + return 2; +} + +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + +void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + const AMDGPUFrameLowering *TFL = static_cast( + MF.getSubtarget().getFrameLowering()); + + unsigned StackWidth = TFL->getStackWidth(MF); + int End = getIndirectIndexEnd(MF); + + if (End == -1) + return; + + for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { + unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); + Reserved.set(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Reserved.set(Reg); + } + } +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::R600_TReg32_XRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid 
Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, AMDGPU::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, + RegState::Implicit | RegState::Kill); + setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); + + return Mov; +} + +unsigned R600InstrInfo::getMaxAlusPerClause() const { + return 115; +} + +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg) const { + MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), + DstReg); // $dst + + if (Src1Reg) { + MIB.addImm(0) // $update_exec_mask + .addImm(0); // $update_predicate + } + MIB.addImm(1) // $write + .addImm(0) // $omod + .addImm(0) // $dst_rel + .addImm(0) // $dst_clamp + .addReg(Src0Reg) // $src0 + .addImm(0) // $src0_neg + .addImm(0) // $src0_rel + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel + + if (Src1Reg) { + MIB.addReg(Src1Reg) // $src1 + .addImm(0) // $src1_neg + .addImm(0) // $src1_rel + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel + } + + //XXX: The r600g finalizer expects this to be 1, once we've moved the + //scheduling to the backend, we can change the default to 0. 
+ MIB.addImm(1) // $last + .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addImm(0) // $literal + .addImm(0); // $bank_swizzle + + return MIB; +} + +#define OPERAND_CASE(Label) \ + case Label: { \ + static const unsigned Ops[] = \ + { \ + Label##_X, \ + Label##_Y, \ + Label##_Z, \ + Label##_W \ + }; \ + return Ops[Slot]; \ + } + +static unsigned getSlotedOps(unsigned Op, unsigned Slot) { + switch (Op) { + OPERAND_CASE(AMDGPU::OpName::update_exec_mask) + OPERAND_CASE(AMDGPU::OpName::update_pred) + OPERAND_CASE(AMDGPU::OpName::write) + OPERAND_CASE(AMDGPU::OpName::omod) + OPERAND_CASE(AMDGPU::OpName::dst_rel) + OPERAND_CASE(AMDGPU::OpName::clamp) + OPERAND_CASE(AMDGPU::OpName::src0) + OPERAND_CASE(AMDGPU::OpName::src0_neg) + OPERAND_CASE(AMDGPU::OpName::src0_rel) + OPERAND_CASE(AMDGPU::OpName::src0_abs) + OPERAND_CASE(AMDGPU::OpName::src0_sel) + OPERAND_CASE(AMDGPU::OpName::src1) + OPERAND_CASE(AMDGPU::OpName::src1_neg) + OPERAND_CASE(AMDGPU::OpName::src1_rel) + OPERAND_CASE(AMDGPU::OpName::src1_abs) + OPERAND_CASE(AMDGPU::OpName::src1_sel) + OPERAND_CASE(AMDGPU::OpName::pred_sel) + default: + llvm_unreachable("Wrong Operand"); + } +} + +#undef OPERAND_CASE + +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( + MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + const { + assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + unsigned Opcode; + if (ST.getGeneration() <= AMDGPUSubtarget::R700) + Opcode = AMDGPU::DOT4_r600; + else + Opcode = AMDGPU::DOT4_eg; + MachineBasicBlock::iterator I = MI; + MachineOperand &Src0 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + MachineOperand &Src1 = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + MachineInstr *MIB = buildDefaultInstruction( + MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + static const unsigned Operands[14] = { + AMDGPU::OpName::update_exec_mask, + AMDGPU::OpName::update_pred, + AMDGPU::OpName::write, + AMDGPU::OpName::omod, + AMDGPU::OpName::dst_rel, + AMDGPU::OpName::clamp, + AMDGPU::OpName::src0_neg, + AMDGPU::OpName::src0_rel, + AMDGPU::OpName::src0_abs, + AMDGPU::OpName::src0_sel, + AMDGPU::OpName::src1_neg, + AMDGPU::OpName::src1_rel, + AMDGPU::OpName::src1_abs, + AMDGPU::OpName::src1_sel, + }; + + MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), + getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + .setReg(MO.getReg()); + + for (unsigned i = 0; i < 14; i++) { + MachineOperand &MO = MI->getOperand( + getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + assert (MO.isImm()); + setImmOperand(MIB, Operands[i], MO.getImm()); + } + MIB->getOperand(20).setImm(0); + return MIB; +} + +MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const { + MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, + AMDGPU::ALU_LITERAL_X); + setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); + return MovImm; +} + +MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const { + return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); +} + +int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { + return getOperandIdx(MI.getOpcode(), Op); +} + +int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const { + return AMDGPU::getNamedOperandIdx(Opcode, Op); +} + +void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, + int64_t Imm) const { + int Idx = getOperandIdx(*MI, Op); + assert(Idx != -1 && "Operand not supported for this instruction."); + assert(MI->getOperand(Idx).isImm()); + MI->getOperand(Idx).setImm(Imm); +} + +//===----------------------------------------------------------------------===// +// Instruction flag getters/setters +//===----------------------------------------------------------------------===// + +bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { + return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; +} + +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + int FlagIndex = 0; + if (Flag != 0) { + // If we pass something other than the default value of Flag to this + // function, it means we are want to set a flag on an instruction + // that uses native encoding. + assert(HAS_NATIVE_OPERANDS(TargetFlags)); + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; + switch (Flag) { + case MO_FLAG_CLAMP: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); + break; + case MO_FLAG_MASK: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); + break; + case MO_FLAG_NOT_LAST: + case MO_FLAG_LAST: + FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); + break; + case MO_FLAG_NEG: + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; + case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; + } + break; + + case MO_FLAG_ABS: + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " + "instructions."); + (void)IsOP3; + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; + case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; + } + break; + + default: + FlagIndex = -1; + break; + } + assert(FlagIndex != -1 && "Flag not supported for this instruction"); + } else { + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); + assert(FlagIndex != 0 && + "Instruction flags not supported for this instruction"); + } + + MachineOperand &FlagOp = MI->getOperand(FlagIndex); + assert(FlagOp.isImm()); + return FlagOp; +} + +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (Flag == 0) { + return; + } + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + if (Flag == MO_FLAG_NOT_LAST) { + clearFlag(MI, Operand, MO_FLAG_LAST); + } else if (Flag == MO_FLAG_MASK) { + clearFlag(MI, Operand, Flag); + } else { + FlagOp.setImm(1); + } + } else { + MachineOperand &FlagOp = getFlagOp(MI, Operand); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + } +} + +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + FlagOp.setImm(0); + } else { + MachineOperand &FlagOp = getFlagOp(MI); + unsigned InstFlags = FlagOp.getImm(); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + FlagOp.setImm(InstFlags); + } +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h new 
file mode 100644 index 00000000000..dee4c2b9ae3 --- /dev/null +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -0,0 +1,303 @@ +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600InstrInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "R600Defines.h" +#include "R600RegisterInfo.h" +#include + +namespace llvm { + + class AMDGPUTargetMachine; + class DFAPacketizer; + class ScheduleDAG; + class MachineFunction; + class MachineInstr; + class MachineInstrBuilder; + + class R600InstrInfo : public AMDGPUInstrInfo { + private: + const R600RegisterInfo RI; + + std::vector > + ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; + + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + public: + enum BankSwizzle { + ALU_VEC_012_SCL_210 = 0, + ALU_VEC_021_SCL_122, + ALU_VEC_120_SCL_212, + ALU_VEC_102_SCL_221, + ALU_VEC_201, + ALU_VEC_210 + }; + + explicit R600InstrInfo(const AMDGPUSubtarget &st); + + const R600RegisterInfo &getRegisterInfo() const override; + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; + + bool isTrig(const MachineInstr &MI) const; + bool isPlaceHolderOpcode(unsigned opcode) const; + bool isReductionOp(unsigned opcode) const; + bool isCubeOp(unsigned opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction. + bool isALUInstr(unsigned Opcode) const; + bool hasInstrModifiers(unsigned Opcode) const; + bool isLDSInstr(unsigned Opcode) const; + bool isLDSNoRetInstr(unsigned Opcode) const; + bool isLDSRetInstr(unsigned Opcode) const; + + /// \returns true if this \p Opcode represents an ALU instruction or an + /// instruction that will be lowered in ExpandSpecialInstrs Pass. + bool canBeConsideredALU(const MachineInstr *MI) const; + + bool isTransOnly(unsigned Opcode) const; + bool isTransOnly(const MachineInstr *MI) const; + bool isVectorOnly(unsigned Opcode) const; + bool isVectorOnly(const MachineInstr *MI) const; + bool isExport(unsigned Opcode) const; + + bool usesVertexCache(unsigned Opcode) const; + bool usesVertexCache(const MachineInstr *MI) const; + bool usesTextureCache(unsigned Opcode) const; + bool usesTextureCache(const MachineInstr *MI) const; + + bool mustBeLastInClause(unsigned Opcode) const; + bool usesAddressRegister(MachineInstr *MI) const; + bool definesAddressRegister(MachineInstr *MI) const; + bool readsLDSSrcReg(const MachineInstr *MI) const; + + /// \returns The operand index for the given source number. Legal values + /// for SrcNum are 0, 1, and 2. 
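+  /// A usage sketch (hypothetical variables): the returned operand index can
+  /// be fed to getSelIdx() to locate the matching *_sel operand, e.g.
+  ///   int SrcIdx = TII->getSrcIdx(Opc, 1);
+  ///   int SelIdx = TII->getSelIdx(Opc, SrcIdx);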
+ int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; + /// \returns The operand Index for the Sel operand given an index to one + /// of the instruction's src operands. + int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; + + /// \returns a pair for each src of an ALU instructions. + /// The first member of a pair is the register id. + /// If register is ALU_CONST, second member is SEL. + /// If register is ALU_LITERAL, second member is IMM. + /// Otherwise, second member value is undefined. + SmallVector, 3> + getSrcs(MachineInstr *MI) const; + + unsigned isLegalUpTo( + const std::vector > > &IGSrcs, + const std::vector &Swz, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + bool FindSwizzleForVectorSlot( + const std::vector > > &IGSrcs, + std::vector &SwzCandidate, + const std::vector > &TransSrcs, + R600InstrInfo::BankSwizzle TransSwz) const; + + /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210 + /// returns true and the first (in lexical order) BankSwizzle affectation + /// starting from the one already provided in the Instruction Group MIs that + /// fits Read Port limitations in BS if available. Otherwise returns false + /// and undefined content in BS. + /// isLastAluTrans should be set if the last Alu of MIs will be executed on + /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to + /// apply to the last instruction. + /// PV holds GPR to PV registers in the Instruction Group MIs. + bool fitsReadPortLimitations(const std::vector &MIs, + const DenseMap &PV, + std::vector &BS, + bool isLastAluTrans) const; + + /// An instruction group can only access 2 channel pair (either [XY] or [ZW]) + /// from KCache bank on R700+. This function check if MI set in input meet + /// this limitations + bool fitsConstReadLimitations(const std::vector &) const; + /// Same but using const index set instead of MI set. + bool fitsConstReadLimitations(const std::vector&) const; + + /// \brief Vector instructions are instructions that must fill all + /// instruction slots within an instruction group. 
+ bool isVector(const MachineInstr &MI) const; + + bool isMov(unsigned Opcode) const override; + + DFAPacketizer * + CreateTargetScheduleState(const TargetSubtargetInfo &) const override; + + bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, bool AllowModify) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef Cond, + DebugLoc DL) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + bool isPredicated(const MachineInstr *MI) const override; + + bool isPredicable(MachineInstr *MI) const override; + + bool + isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + const BranchProbability &Probability) const override; + + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const override ; + + bool + isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + const BranchProbability &Probability) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector &Pred) const override; + + bool SubsumesPredicate(ArrayRef Pred1, + ArrayRef Pred2) const override; + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override; + + bool PredicateInstruction(MachineInstr *MI, + ArrayRef Pred) const override; + + unsigned int getPredicationCost(const MachineInstr *) const override; + + unsigned int getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = nullptr) const override; + + int getInstrLatency(const InstrItineraryData *ItinData, + SDNode *Node) const override { return 1;} + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + /// \brief Reserve the registers that may be accesed using indirect addressing. + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const override; + + unsigned getMaxAlusPerClause() const; + + ///buildDefaultInstruction - This function returns a MachineInstr with + /// all the instruction modifiers initialized to their default values. + /// You can use this function to avoid manually specifying each instruction + /// modifier operand when building a new instruction. + /// + /// \returns a MachineInstr with all the instruction modifiers initialized + /// to their default values. 
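+  /// A minimal sketch of the intended use (mirroring buildMovImm in
+  /// R600InstrInfo.cpp; register names are placeholders):
+  ///   MachineInstr *Mov = buildDefaultInstruction(MBB, I, AMDGPU::MOV,
+  ///                                               DstReg, AMDGPU::ALU_LITERAL_X);
+  ///   setImmOperand(Mov, AMDGPU::OpName::literal, Imm);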
+ MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + unsigned Src0Reg, + unsigned Src1Reg = 0) const; + + MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned Slot, + unsigned DstReg) const; + + MachineInstr *buildMovImm(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + + /// \brief Get the index of Op in the MachineInstr. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(const MachineInstr &MI, unsigned Op) const; + + /// \brief Get the index of \p Op for the given Opcode. + /// + /// \returns -1 if the Instruction does not contain the specified \p Op. + int getOperandIdx(unsigned Opcode, unsigned Op) const; + + /// \brief Helper function for setting instruction flag values. + void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; + + /// \returns true if this instruction has an operand for storing target flags. + bool hasFlagOperand(const MachineInstr &MI) const; + + ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + + ///\brief Determine if the specified \p Flag is set on this \p Operand. + bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) + /// \param Flag The flag being set. + /// + /// \returns the operand containing the flags for this instruction. + MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + unsigned Flag = 0) const; + + /// \brief Clear the specified flag on the instruction. + void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; +}; + +namespace AMDGPU { + +int getLDSNoRetOp(uint16_t Opcode); + +} //End namespace AMDGPU + +} // End llvm namespace + +#endif diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td new file mode 100644 index 00000000000..7beed092b3f --- /dev/null +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -0,0 +1,1744 @@ +//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are available on R600 family +// GPUs. 
+// +//===----------------------------------------------------------------------===// + +include "R600Intrinsics.td" +include "R600InstrFormats.td" + +class InstR600ISA pattern> : + InstR600 { + + let Namespace = "AMDGPU"; +} + +def MEMxi : Operand { + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); + let PrintMethod = "printMemOperand"; +} + +def MEMrr : Operand { + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); +} + +// Operands for non-registers + +class InstFlag + : OperandWithDefaultOps { + let PrintMethod = PM; +} + +// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers +def SEL : OperandWithDefaultOps { + let PrintMethod = "printSel"; +} +def BANK_SWIZZLE : OperandWithDefaultOps { + let PrintMethod = "printBankSwizzle"; +} + +def LITERAL : InstFlag<"printLiteral">; + +def WRITE : InstFlag <"printWrite", 1>; +def OMOD : InstFlag <"printOMOD">; +def REL : InstFlag <"printRel">; +def CLAMP : InstFlag <"printClamp">; +def NEG : InstFlag <"printNeg">; +def ABS : InstFlag <"printAbs">; +def UEM : InstFlag <"printUpdateExecMask">; +def UP : InstFlag <"printUpdatePred">; + +// XXX: The r600g finalizer in Mesa expects last to be one in most cases. +// Once we start using the packetizer in this backend we should have this +// default to 0. +def LAST : InstFlag<"printLast", 1>; +def RSel : Operand { + let PrintMethod = "printRSel"; +} +def CT: Operand { + let PrintMethod = "printCT"; +} + +def FRAMEri : Operand { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + +def ADDRParam : ComplexPattern; +def ADDRDWord : ComplexPattern; +def ADDRVTX_READ : ComplexPattern; +def ADDRGA_CONST_OFFSET : ComplexPattern; +def ADDRGA_VAR_OFFSET : ComplexPattern; + + +def R600_Pred : PredicateOperand; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +// Class for instructions with only one source register. +// If you add new ins to this instruction, make sure they are listed before +// $literal, because the backend currently assumes that the last operand is +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), +// and R600InstrInfo::getOperandIdx(). +class R600_1OP inst, string opName, list pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 { + + let src1 = 0; + let src1_rel = 0; + let src1_neg = 0; + let src1_abs = 0; + let update_exec_mask = 0; + let update_pred = 0; + let HasNativeOperands = 1; + let Op1 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_1OP_Helper inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_1OP ; + +// If you add or change the operands for R600_2OP instructions, you must +// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). 
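+// Concrete opcodes are normally declared through the helper classes defined
+// here, e.g. (illustrative only; encodings come from the ISA documents):
+//   def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;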
+class R600_2OP inst, string opName, list pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, + OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, + "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " + "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " + "$pred_sel $bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP2 { + + let HasNativeOperands = 1; + let Op2 = 1; + let ALUInst = 1; + let DisableEncoding = "$literal"; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_2OP_Helper inst, string opName, SDPatternOperator node, + InstrItinClass itin = AnyALU> : + R600_2OP ; + +// If you add our change the operands for R600_3OP instructions, you must +// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, +// R600InstrInfo::buildDefaultInstruction(), and +// R600InstrInfo::getOperandIdx(). +class R600_3OP inst, string opName, list pattern, + InstrItinClass itin = AnyALU> : + InstR600 <(outs R600_Reg32:$dst), + (ins REL:$dst_rel, CLAMP:$clamp, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " + "$src0_neg$src0$src0_rel, " + "$src1_neg$src1$src1_rel, " + "$src2_neg$src2$src2_rel, " + "$pred_sel" + "$bank_swizzle"), + pattern, + itin>, + R600ALU_Word0, + R600ALU_Word1_OP3{ + + let HasNativeOperands = 1; + let DisableEncoding = "$literal"; + let Op3 = 1; + let UseNamedOperandTable = 1; + let ALUInst = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class R600_REDUCTION inst, dag ins, string asm, list pattern, + InstrItinClass itin = VecALU> : + InstR600 <(outs R600_Reg32:$dst), + ins, + asm, + pattern, + itin>; + + + +} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; + +def TEX_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 14; + }] +>; + +def TEX_ARRAY_MSAA : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 15; + }] +>; + +class EG_CF_RAT cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, + dag outs, dag ins, string asm, list pattern> : + InstR600ISA , + CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { + + let rat_id = ratid; + let rat_inst = ratinst; + let rim = 0; + // XXX: Have a separate instruction for non-indexed writes. 
+ let type = 1; + let rw_rel = 0; + let elem_size = 0; + + let array_size = 0; + let comp_mask = mask; + let burst_count = 0; + let vpm = 0; + let cf_inst = cfinst; + let mark = 0; + let barrier = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; + +} + +class VTX_READ buffer_id, dag outs, list pattern> + : InstR600ISA , + VTX_WORD1_GPR { + + // Static fields + let DST_REL = 0; + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, + // however, based on my testing if USE_CONST_FIELDS is set, then all + // these fields need to be set to 0. + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 1; + let FORMAT_COMP_ALL = 0; + let SRF_MODE_ALL = 0; + + let Inst{63-32} = Word1; + // LLVM can only encode 64-bit instructions, so these fields are manually + // encoded in R600CodeEmitter + // + // bits<16> OFFSET; + // bits<2> ENDIAN_SWAP = 0; + // bits<1> CONST_BUF_NO_STRIDE = 0; + // bits<1> MEGA_FETCH = 0; + // bits<1> ALT_CONST = 0; + // bits<2> BUFFER_INDEX_MODE = 0; + + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding + // is done in R600CodeEmitter + // + // Inst{79-64} = OFFSET; + // Inst{81-80} = ENDIAN_SWAP; + // Inst{82} = CONST_BUF_NO_STRIDE; + // Inst{83} = MEGA_FETCH; + // Inst{84} = ALT_CONST; + // Inst{86-85} = BUFFER_INDEX_MODE; + // Inst{95-86} = 0; Reserved + + // VTX_WORD3 (Padding) + // + // Inst{127-96} = 0; + + let VTXInst = 1; +} + +class LoadParamFrag : PatFrag < + (ops node:$ptr), (load_type node:$ptr), + [{ return isConstantLoad(dyn_cast(N), 0); }] +>; + +def load_param : LoadParamFrag; +def load_param_exti8 : LoadParamFrag; +def load_param_exti16 : LoadParamFrag; + +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; + +def isR600toCayman + : Predicate< + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; + +//===----------------------------------------------------------------------===// +// R600 SDNodes +//===----------------------------------------------------------------------===// + +def INTERP_PAIR_XY : AMDGPUShaderInst < + (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", + []>; + +def INTERP_PAIR_ZW : AMDGPUShaderInst < + (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), + (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), + "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", + []>; + +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", + SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPVariadic] +>; + +def DOT4 : SDNode<"AMDGPUISD::DOT4", + SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, + SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, + SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, + [] +>; + +def COS_HW : SDNode<"AMDGPUISD::COS_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", + SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> +>; + +def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; + +def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; + +multiclass TexPattern TextureOp, Instruction inst, ValueType vt = v4f32> { +def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, + (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), + (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), + (i32 
imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), + (i32 imm:$DST_SEL_W), + (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), + (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), + (i32 imm:$COORD_TYPE_W)), + (inst R600_Reg128:$SRC_GPR, + imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, + imm:$offsetx, imm:$offsety, imm:$offsetz, + imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, + imm:$DST_SEL_W, + imm:$RESOURCE_ID, imm:$SAMPLER_ID, + imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, + imm:$COORD_TYPE_W)>; +} + +//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// + +def INTERP_VEC_LOAD : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins i32imm:$src0), + "INTERP_LOAD $src0 : $dst", + [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; + +def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { + let bank_swizzle = 5; +} + +def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { + let bank_swizzle = 5; +} + +def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; + +//===----------------------------------------------------------------------===// +// Export Instructions +//===----------------------------------------------------------------------===// + +def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; + +def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, + [SDNPHasChain, SDNPSideEffect]>; + +class ExportWord0 { + field bits<32> Word0; + + bits<13> arraybase; + bits<2> type; + bits<7> gpr; + bits<2> elem_size; + + let Word0{12-0} = arraybase; + let Word0{14-13} = type; + let Word0{21-15} = gpr; + let Word0{22} = 0; // RW_REL + let Word0{29-23} = 0; // INDEX_GPR + let Word0{31-30} = elem_size; +} + +class ExportSwzWord1 { + field bits<32> Word1; + + bits<3> sw_x; + bits<3> sw_y; + bits<3> sw_z; + bits<3> sw_w; + bits<1> eop; + bits<8> inst; + + let Word1{2-0} = sw_x; + let Word1{5-3} = sw_y; + let Word1{8-6} = sw_z; + let Word1{11-9} = sw_w; +} + +class ExportBufWord1 { + field bits<32> Word1; + + bits<12> arraySize; + bits<4> compMask; + bits<1> eop; + bits<8> inst; + + let Word1{11-0} = arraySize; + let Word1{15-12} = compMask; +} + +multiclass ExportPattern cf_inst> { + def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 0, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), + (ExportInst + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), + 0, 61, 7, 0, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy (i32 imm:$type)), + (ExportInst + (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(int_R600_store_dummy 1), + (ExportInst + (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$base, + imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) + >; + +} + +multiclass SteamOutputExportPattern buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { +// Stream0 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, + 4095, imm:$mask, buf0inst, 0)>; +// Stream1 + def : Pat<(int_R600_store_stream_output (v4f32 
R600_Reg128:$src), + (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf1inst, 0)>; +// Stream2 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf2inst, 0)>; +// Stream3 + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), + (ExportInst $src, 0, imm:$arraybase, + 4095, imm:$mask, buf3inst, 0)>; +} + +// Export Instructions should not be duplicated by TailDuplication pass +// (which assumes that duplicable instruction are affected by exec mask) +let usesCustomInserter = 1, isNotDuplicable = 1 in { + +class ExportSwzInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst, + i32imm:$eop), + !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"), + []>, ExportWord0, ExportSwzWord1 { + let elem_size = 3; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +} // End usesCustomInserter = 1 + +class ExportBufInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportBufWord1 { + let elem_size = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + let IsExport = 1; +} + +//===----------------------------------------------------------------------===// +// Control Flow Instructions +//===----------------------------------------------------------------------===// + + +def KCACHE : InstFlag<"printKCache">; + +class ALU_CLAUSE inst, string OpName> : AMDGPUInst <(outs), +(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, +KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, +i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, +i32imm:$COUNT, i32imm:$Enabled), +!strconcat(OpName, " $COUNT, @$ADDR, " +"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), +[] >, CF_ALU_WORD0, CF_ALU_WORD1 { + field bits<64> Inst; + + let CF_INST = inst; + let ALT_CONST = 0; + let WHOLE_QUAD_MODE = 0; + let BARRIER = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_WORD0_R600 { + field bits<32> Word0; + + bits<32> ADDR; + + let Word0 = ADDR; +} + +class CF_CLAUSE_R600 inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { + field bits<64> Inst; + bits<4> CNT; + + let CF_INST = inst; + let BARRIER = 1; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let COUNT = CNT{2-0}; + let CALL_COUNT = 0; + let COUNT_3 = CNT{3}; + let END_OF_PROGRAM = 0; + let WHOLE_QUAD_MODE = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +class CF_CLAUSE_EG inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { + field bits<64> Inst; + + let CF_INST = inst; + let BARRIER = 1; + let JUMPTABLE_SEL = 0; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let END_OF_PROGRAM = 0; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +def CF_ALU : ALU_CLAUSE<8, "ALU">; +def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; +def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; +def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; +def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; +def CF_ALU_ELSE_AFTER : 
ALU_CLAUSE<15, "ALU_ELSE_AFTER">; + +def FETCH_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def ALU_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; + let isCodeGenOnly = 1; +} + +def LITERALS : AMDGPUInst <(outs), +(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + let isCodeGenOnly = 1; + + field bits<64> Inst; + bits<32> literal1; + bits<32> literal2; + + let Inst{31-0} = literal1; + let Inst{63-32} = literal2; +} + +def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { + field bits<64> Inst; +} + +let Predicates = [isR600toCayman] in { + +//===----------------------------------------------------------------------===// +// Common Instructions R600, R700, Evergreen, Cayman +//===----------------------------------------------------------------------===// + +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; +// Non-IEEE MUL: 0 * anything = 0 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; +// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx +// DX10 min/max returns the other operand if one is NaN, +// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic +def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; +def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; + +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, +// so some of the instruction names don't match the asm string. +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
+def SETE : R600_2OP < + 0x08, "SETE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] +>; + +def SGT : R600_2OP < + 0x09, "SETGT", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] +>; + +def SGE : R600_2OP < + 0xA, "SETGE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] +>; + +def SNE : R600_2OP < + 0xB, "SETNE", + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] +>; + +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] +>; + +// FIXME: This should probably be COND_ONE +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] +>; + +def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; +def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; +def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; +def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; +def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; + +def MOV : R600_1OP <0x19, "MOV", []>; + +let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { + +class MOV_IMM : AMDGPUInst < + (outs R600_Reg32:$dst), + (ins immType:$imm), + "", + [] +>; + +} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 + +def MOV_IMM_I32 : MOV_IMM; +def : Pat < + (imm:$val), + (MOV_IMM_I32 imm:$val) +>; + +def MOV_IMM_F32 : MOV_IMM; +def : Pat < + (fpimm:$val), + (MOV_IMM_F32 fpimm:$val) +>; + +def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; +def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; +def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; +def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>; + +let hasSideEffects = 1 in { + +def KILLGT : R600_2OP <0x2D, "KILLGT", []>; + +} // end hasSideEffects + +def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; +def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; +def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; +def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; +def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; +def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; +def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; +def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; +def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; +def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; + +def SETE_INT : R600_2OP < + 0x3A, "SETE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] +>; + +def SETGT_INT : R600_2OP < + 0x3B, "SETGT_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] +>; + +def SETGE_INT : R600_2OP < + 0x3C, "SETGE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] +>; + +def SETNE_INT : R600_2OP < + 0x3D, "SETNE_INT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] +>; + +def SETGT_UINT : R600_2OP < + 0x3E, "SETGT_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] +>; + +def SETGE_UINT : R600_2OP < + 0x3F, "SETGE_UINT", + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] +>; + +def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; +def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>; +def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; 
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; + +def CNDE_INT : R600_3OP < + 0x1C, "CNDE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] +>; + +def CNDGE_INT : R600_3OP < + 0x1E, "CNDGE_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))] +>; + +def CNDGT_INT : R600_3OP < + 0x1D, "CNDGT_INT", + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))] +>; + +//===----------------------------------------------------------------------===// +// Texture instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +class R600_TEX inst, string opName> : + InstR600 <(outs R600_Reg128:$DST_GPR), + (ins R600_Reg128:$SRC_GPR, + RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw, + i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz, + RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W, + i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, + CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, + CT:$COORD_TYPE_W), + !strconcat(opName, + " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " + "$SRC_GPR.$srcx$srcy$srcz$srcw " + "RID:$RESOURCE_ID SID:$SAMPLER_ID " + "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"), + [], + NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + + let TEX_INST = inst{4-0}; + let SRC_REL = 0; + let DST_REL = 0; + let LOD_BIAS = 0; + + let INST_MOD = 0; + let FETCH_WHOLE_QUAD = 0; + let ALT_CONST = 0; + let SAMPLER_INDEX_MODE = 0; + let RESOURCE_INDEX_MODE = 0; + + let TEXInst = 1; +} + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + + + +def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">; +def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">; +def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">; +def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">; +def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">; +def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">; +def TEX_LD : R600_TEX <0x03, "TEX_LD">; +def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> { + let INST_MOD = 1; +} +def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">; +def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">; +def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">; +def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">; +def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">; +def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">; +def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">; + +defm : TexPattern<0, TEX_SAMPLE>; +defm : TexPattern<1, TEX_SAMPLE_C>; +defm : TexPattern<2, TEX_SAMPLE_L>; +defm : TexPattern<3, TEX_SAMPLE_C_L>; +defm : TexPattern<4, TEX_SAMPLE_LB>; +defm : TexPattern<5, TEX_SAMPLE_C_LB>; +defm : TexPattern<6, TEX_LD, v4i32>; +defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>; +defm : TexPattern<8, TEX_GET_GRADIENTS_H>; +defm : TexPattern<9, TEX_GET_GRADIENTS_V>; +defm : TexPattern<10, TEX_LDPTR, v4i32>; + +//===----------------------------------------------------------------------===// +// Helper classes for common instructions +//===----------------------------------------------------------------------===// + +class MUL_LIT_Common inst> : R600_3OP < + inst, "MUL_LIT", + [] +>; + +class MULADD_Common inst> : R600_3OP < + inst, "MULADD", + [] +>; + +class MULADD_IEEE_Common inst> : R600_3OP < + inst, "MULADD_IEEE", + [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] +>; 
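+// Editorial aside (not part of the original patch): the *_Common helper
+// classes above and below only fix the mnemonic and the selection pattern;
+// the generation-specific opcode is supplied where they are instantiated,
+// e.g. the R600/R700 block later in this file has
+//   def MULADD_r600 : MULADD_Common<0x10>;
+// and other generations presumably instantiate the same helpers with their
+// own encodings, so each pattern is written only once.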
+ +class FMA_Common inst> : R600_3OP < + inst, "FMA", + [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU +>; + +class CNDE_Common inst> : R600_3OP < + inst, "CNDE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] +>; + +class CNDGT_Common inst> : R600_3OP < + inst, "CNDGT", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] +> { + let Itinerary = VecALU; +} + +class CNDGE_Common inst> : R600_3OP < + inst, "CNDGE", + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] +> { + let Itinerary = VecALU; +} + + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins +// Slot X + UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, + OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, + R600_Pred:$pred_sel_X, +// Slot Y + UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, + OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, + OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, + OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU> { + + let UseNamedOperandTable = 1; + +} +} + +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 + R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, + R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, + R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, + R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; + + +class DOT4_Common inst> : R600_2OP ; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { +multiclass CUBE_Common inst> { + + def _pseudo : InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0), + "CUBE $dst $src0", + [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], + VecALU + > { + let isPseudo = 1; + let UseNamedOperandTable = 1; + } + + def _real : R600_2OP ; +} +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +class EXP_IEEE_Common inst> : R600_1OP_Helper < + inst, "EXP_IEEE", fexp2 +> { + let Itinerary = TransALU; +} + +class FLT_TO_INT_Common inst> : R600_1OP_Helper < + inst, "FLT_TO_INT", fp_to_sint +> { + let Itinerary = TransALU; +} + +class INT_TO_FLT_Common inst> : R600_1OP_Helper < + inst, "INT_TO_FLT", sint_to_fp +> { + let Itinerary = TransALU; +} + +class FLT_TO_UINT_Common inst> : R600_1OP_Helper < + inst, "FLT_TO_UINT", fp_to_uint +> { + let Itinerary = TransALU; +} + +class UINT_TO_FLT_Common inst> : R600_1OP_Helper < + inst, "UINT_TO_FLT", uint_to_fp +> { + let Itinerary = TransALU; +} + +class LOG_CLAMPED_Common inst> : R600_1OP < + inst, 
"LOG_CLAMPED", [] +>; + +class LOG_IEEE_Common inst> : R600_1OP_Helper < + inst, "LOG_IEEE", flog2 +> { + let Itinerary = TransALU; +} + +class LSHL_Common inst> : R600_2OP_Helper ; +class LSHR_Common inst> : R600_2OP_Helper ; +class ASHR_Common inst> : R600_2OP_Helper ; +class MULHI_INT_Common inst> : R600_2OP_Helper < + inst, "MULHI_INT", mulhs +> { + let Itinerary = TransALU; +} +class MULHI_UINT_Common inst> : R600_2OP_Helper < + inst, "MULHI", mulhu +> { + let Itinerary = TransALU; +} +class MULLO_INT_Common inst> : R600_2OP_Helper < + inst, "MULLO_INT", mul +> { + let Itinerary = TransALU; +} +class MULLO_UINT_Common inst> : R600_2OP { + let Itinerary = TransALU; +} + +class RECIP_CLAMPED_Common inst> : R600_1OP < + inst, "RECIP_CLAMPED", [] +> { + let Itinerary = TransALU; +} + +class RECIP_IEEE_Common inst> : R600_1OP < + inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] +> { + let Itinerary = TransALU; +} + +class RECIP_UINT_Common inst> : R600_1OP_Helper < + inst, "RECIP_UINT", AMDGPUurecip +> { + let Itinerary = TransALU; +} + +// Clamped to maximum. +class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped +> { + let Itinerary = TransALU; +} + +class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < + inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy +> { + let Itinerary = TransALU; +} + +// TODO: There is also RECIPSQRT_FF which clamps to zero. + +class SIN_Common inst> : R600_1OP < + inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ + let Trig = 1; + let Itinerary = TransALU; +} + +class COS_Common inst> : R600_1OP < + inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> { + let Trig = 1; + let Itinerary = TransALU; +} + +def CLAMP_R600 : CLAMP ; +def FABS_R600 : FABS; +def FNEG_R600 : FNEG; + +//===----------------------------------------------------------------------===// +// Helper patterns for complex intrinsics +//===----------------------------------------------------------------------===// + +// FIXME: Should be predicated on unsafe fp math. 
+multiclass DIV_Common { +def : Pat< + (int_AMDGPU_div f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : Pat< + (fdiv f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) +>; + +def : RcpPat; +} + +class TGSI_LIT_Z_Common + : Pat < + (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) +>; + +//===----------------------------------------------------------------------===// +// R600 / R700 Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR600] in { + + def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; + def MULADD_r600 : MULADD_Common<0x10>; + def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; + def CNDE_r600 : CNDE_Common<0x18>; + def CNDGT_r600 : CNDGT_Common<0x19>; + def CNDGE_r600 : CNDGE_Common<0x1A>; + def DOT4_r600 : DOT4_Common<0x50>; + defm CUBE_r600 : CUBE_Common<0x52>; + def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; + def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; + def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; + def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; + def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; + def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; + def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; + def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; + def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; + def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; + def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; + def SIN_r600 : SIN_Common<0x6E>; + def COS_r600 : COS_Common<0x6F>; + def ASHR_r600 : ASHR_Common<0x70>; + def LSHR_r600 : LSHR_Common<0x71>; + def LSHL_r600 : LSHL_Common<0x72>; + def MULLO_INT_r600 : MULLO_INT_Common<0x73>; + def MULHI_INT_r600 : MULHI_INT_Common<0x74>; + def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; + def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; + def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; + + defm DIV_r600 : DIV_Common; + def : POW_Common ; + def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; + + def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; + def : RsqPat; + + def R600_ExportSwz : ExportSwzInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern; + + def R600_ExportBuf : ExportBufInst { + let Word1{20-17} = 0; // BURST_COUNT + let Word1{21} = eop; + let Word1{22} = 0; // VALID_PIXEL_MODE + let Word1{30-23} = inst; + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern; + + def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT), + "TEX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT), + "VTX $CNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let CNT = 0; + } + def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), + 
"PUSH_ELSE @$ADDR"> { + let CNT = 0; + let POP_COUNT = 0; // FIXME? + } + def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { + let ADDR = 0; + let CNT = 0; + let POP_COUNT = 0; + } + def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let CNT = 0; + } + def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { + let CNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + +} + + +//===----------------------------------------------------------------------===// +// Regist loads and stores - for indirect addressing +//===----------------------------------------------------------------------===// + +defm R600_ : RegisterLoadStore ; + + +//===----------------------------------------------------------------------===// +// Pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +def PRED_X : InstR600 < + (outs R600_Predicate_Bit:$dst), + (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), + "", [], NullALU> { + let FlagOperandIdx = 3; +} + +let isTerminator = 1, isBranch = 1 in { +def JUMP_COND : InstR600 < + (outs), + (ins brtarget:$target, R600_Predicate_Bit:$p), + "JUMP $target ($p)", + [], AnyALU + >; + +def JUMP : InstR600 < + (outs), + (ins brtarget:$target), + "JUMP $target", + [], AnyALU + > +{ + let isPredicable = 1; + let isBarrier = 1; +} + +} // End isTerminator = 1, isBranch = 1 + +let usesCustomInserter = 1 in { + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { + +def MASK_WRITE : AMDGPUShaderInst < + (outs), + (ins R600_Reg32:$src), + "MASK_WRITE $src", + [] +>; + +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 + + +def TXD: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + NullALU > { + let TEXInst = 1; +} + +def TXD_SHADOW: InstR600 < + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], + NullALU +> { + let TEXInst = 1; +} +} // End isPseudo = 1 +} // End usesCustomInserter = 1 + + +//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = + [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let hasSideEffects = 0; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), 
(ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + VTX_WORD1_GPR, VTX_WORD0_eg { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +def TEX_VTX_TEXBUF: + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", + [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, +VTX_WORD1_GPR, VTX_WORD0_eg { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X = 0; +let DST_SEL_Y = 1; +let DST_SEL_Z = 2; +let DST_SEL_W = 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; + let VTXInst = 1; +} + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +class ILFormat pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +multiclass BranchConditional { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, rci:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, (i32 rci:$src0))]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, rcf:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, (f32 
rcf:$src0))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), + !strconcat(name, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2 { + def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; + def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), + !strconcat(name, " $src0, $src1"), []>; +} + +//===---------------------------------------------------------------------===// +// Custom Inserter for Branches and returns, this eventually will be a +// separate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { + def BRANCH : ILFormat<(outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional; +} + +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(IL_retflag)]>; +} + +//===----------------------------------------------------------------------===// +// Branch Instructions +//===----------------------------------------------------------------------===// + +def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), + "IF_PREDICATE_SET $src", []>; + +let isTerminator=1 in { + def BREAK : ILFormat< (outs), (ins), + "BREAK", []>; + def CONTINUE : ILFormat< (outs), (ins), + "CONTINUE", []>; + def DEFAULT : ILFormat< (outs), (ins), + "DEFAULT", []>; + def ELSE : ILFormat< (outs), (ins), + "ELSE", []>; + def ENDSWITCH : ILFormat< (outs), (ins), + "ENDSWITCH", []>; + def ENDMAIN : ILFormat< (outs), (ins), + "ENDMAIN", []>; + def END : ILFormat< (outs), (ins), + "END", []>; + def ENDFUNC : ILFormat< (outs), (ins), + "ENDFUNC", []>; + def ENDIF : ILFormat< (outs), (ins), + "ENDIF", []>; + def WHILELOOP : ILFormat< (outs), (ins), + "WHILE", []>; + def ENDLOOP : ILFormat< (outs), (ins), + "ENDLOOP", []>; + def FUNC : ILFormat< (outs), (ins), + "FUNC", []>; + def RETDYN : ILFormat< (outs), (ins), + "RET_DYN", []>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; + defm IFC : BranchInstr2<"IFC">; + defm BREAKC : BranchInstr2<"BREAKC">; + defm CONTINUEC : BranchInstr2<"CONTINUEC">; +} + +//===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions 
+//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical ; +def R600_EXTRACT_ELT_V4 : ExtractVertical ; + +def R600_INSERT_ELT_V2 : InsertVertical ; +def R600_INSERT_ELT_V4 : InsertVertical ; + +class ExtractVerticalPat : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; +def : ExtractVerticalPat ; + +class InsertVerticalPat : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat ; +def : InsertVerticalPat ; +def : InsertVerticalPat ; +def : InsertVerticalPat ; + +//===----------------------------------------------------------------------===// +// ISel Patterns +//===----------------------------------------------------------------------===// + +// CND*_INT Pattterns for f32 True / False values + +class CND_INT_f32 : Pat < + (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), + (cnd $src0, $src1, $src2) +>; + +def : CND_INT_f32 ; +def : CND_INT_f32 ; +def : CND_INT_f32 ; + +//CNDGE_INT extra pattern +def : Pat < + (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), + (CNDGE_INT $src0, $src1, $src2) +>; + +// KIL Patterns +def KILP : Pat < + (int_AMDGPU_kilp), + (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) +>; + +def KIL : Pat < + (int_AMDGPU_kill f32:$src0), + (MASK_WRITE (KILLGT (f32 ZERO), $src0)) +>; + +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; + +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; +def : Insert_Element ; + +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; + +def : Extract_Element ; +def : Extract_Element ; + +def : Insert_Element ; +def : Insert_Element ; + +// bitconvert patterns + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +// DWORDADDR pattern +def : DwordAddrPat ; + +} // End isR600toCayman Predicate + +let Predicates = [isR600] in { +// Intrinsic patterns +defm : Expand24IBitOps; +defm : Expand24UBitOps; +} // End isR600 + +def getLDSNoRetOp : InstrMapping { + let FilterClass = "R600_LDS_1A1D"; + let RowFields = ["BaseOp"]; + let ColFields = ["DisableEncoding"]; + let KeyCol = ["$dst"]; + let ValueCols = [[""""]]; +} diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td new file mode 100644 index 00000000000..9681747006d --- /dev/null +++ b/lib/Target/AMDGPU/R600Intrinsics.td @@ -0,0 +1,75 @@ +//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// R600 Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + class TextureIntrinsicFloatInput : + Intrinsic<[llvm_v4f32_ty], [ + llvm_v4f32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + class TextureIntrinsicInt32Input : + Intrinsic<[llvm_v4i32_ty], [ + llvm_v4i32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty // coord_type_w + ], [IntrNoMem]>; + + def int_R600_load_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_input : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_interp_const : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_R600_interp_xy : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; +def int_R600_interp_zw : + Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_R600_load_texbuf : + Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_R600_tex : TextureIntrinsicFloatInput; + def int_R600_texc : TextureIntrinsicFloatInput; + def int_R600_txl : TextureIntrinsicFloatInput; + def int_R600_txlc : TextureIntrinsicFloatInput; + def int_R600_txb : TextureIntrinsicFloatInput; + def int_R600_txbc : TextureIntrinsicFloatInput; + def int_R600_txf : TextureIntrinsicInt32Input; + def int_R600_ldptr : TextureIntrinsicInt32Input; + def int_R600_txq : TextureIntrinsicInt32Input; + def int_R600_ddx : TextureIntrinsicFloatInput; + def int_R600_ddy : TextureIntrinsicFloatInput; + def int_R600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_stream_output : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_depth : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_stencil : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_dummy : + Intrinsic<[], [llvm_i32_ty], []>; +} diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp new file mode 100644 index 00000000000..01105c614c5 --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.cpp @@ -0,0 +1,20 @@ +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + + +// Pin the vtable to this file. 
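+// (Editorial note, not part of the original patch: the empty out-of-line
+// anchor() below is the usual LLVM idiom for giving a class with virtual
+// methods a single "home" object file, so its vtable is emitted once here
+// rather than as a weak copy in every translation unit that includes the
+// header.)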
+void R600MachineFunctionInfo::anchor() {} + +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF) { } diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h new file mode 100644 index 00000000000..263561edd30 --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -0,0 +1,34 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include + +namespace llvm { + +class R600MachineFunctionInfo : public AMDGPUMachineFunction { + void anchor() override; +public: + R600MachineFunctionInfo(const MachineFunction &MF); + SmallVector LiveOuts; + std::vector IndirectRegs; + unsigned StackSize; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp new file mode 100644 index 00000000000..bcde5fb50da --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -0,0 +1,469 @@ +//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "R600MachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { + assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); + DAG = static_cast(dag); + const AMDGPUSubtarget &ST = DAG->MF.getSubtarget(); + TII = static_cast(DAG->TII); + TRI = static_cast(DAG->TRI); + VLIW5 = !ST.hasCaymanISA(); + MRI = &DAG->MRI; + CurInstKind = IDOther; + CurEmitted = 0; + OccupedSlotsMask = 31; + InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); + InstKindLimit[IDOther] = 32; + InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); + AluInstCount = 0; + FetchInstCount = 0; +} + +void R600SchedStrategy::MoveUnits(std::vector &QSrc, + std::vector &QDst) +{ + QDst.insert(QDst.end(), QSrc.begin(), QSrc.end()); + QSrc.clear(); +} + +static +unsigned getWFCountLimitedByGPR(unsigned GPRCount) { + assert (GPRCount && "GPRCount cannot be 0"); + return 248 / GPRCount; +} + +SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { + SUnit *SU = nullptr; + NextInstKind = IDOther; + + IsTopNode = false; + + // check if we might want to switch current clause type + bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || + (Available[CurInstKind].empty()); + bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && + (!Available[IDFetch].empty() || !Available[IDOther].empty()); + + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { + // We use the heuristic provided by AMD Accelerated Parallel Processing + // OpenCL Programming Guide : + // The approx. number of WF that allows TEX inst to hide ALU inst is : + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) + float ALUFetchRationEstimate = + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / + (FetchInstCount + Available[IDFetch].size()); + if (ALUFetchRationEstimate == 0) { + AllowSwitchFromAlu = true; + } else { + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. 
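+      // Illustrative arithmetic (editorial, not in the original patch):
+      // with 10 SUnits in Available[IDFetch], the fetch clause is assumed
+      // to need 2 * 10 = 20 GPRs, and getWFCountLimitedByGPR(20) = 248 / 20
+      // = 12 wavefronts; if NeededWF exceeds that, we allow switching away
+      // from ALU so the fetch clause can drain and relieve 128-bit register
+      // pressure.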
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } + } + + if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || + (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { + // try to pick ALU + SU = pickAlu(); + if (!SU && !PhysicalRegCopy.empty()) { + SU = PhysicalRegCopy.front(); + PhysicalRegCopy.erase(PhysicalRegCopy.begin()); + } + if (SU) { + if (CurEmitted >= InstKindLimit[IDAlu]) + CurEmitted = 0; + NextInstKind = IDAlu; + } + } + + if (!SU) { + // try to pick FETCH + SU = pickOther(IDFetch); + if (SU) + NextInstKind = IDFetch; + } + + // try to pick other + if (!SU) { + SU = pickOther(IDOther); + if (SU) + NextInstKind = IDOther; + } + + DEBUG( + if (SU) { + dbgs() << " ** Pick node **\n"; + SU->dump(DAG); + } else { + dbgs() << "NO NODE \n"; + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + } + ); + + return SU; +} + +void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + if (NextInstKind != CurInstKind) { + DEBUG(dbgs() << "Instruction Type Switch\n"); + if (NextInstKind != IDAlu) + OccupedSlotsMask |= 31; + CurEmitted = 0; + CurInstKind = NextInstKind; + } + + if (CurInstKind == IDAlu) { + AluInstCount ++; + switch (getAluKind(SU)) { + case AluT_XYZW: + CurEmitted += 4; + break; + case AluDiscarded: + break; + default: { + ++CurEmitted; + for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), + E = SU->getInstr()->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + ++CurEmitted; + } + } + } + } else { + ++CurEmitted; + } + + + DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + + if (CurInstKind != IDFetch) { + MoveUnits(Pending[IDFetch], Available[IDFetch]); + } else + FetchInstCount++; +} + +static bool +isPhysicalRegCopy(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::COPY) + return false; + + return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); +} + +void R600SchedStrategy::releaseTopNode(SUnit *SU) { + DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); +} + +void R600SchedStrategy::releaseBottomNode(SUnit *SU) { + DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); + if (isPhysicalRegCopy(SU->getInstr())) { + PhysicalRegCopy.push_back(SU); + return; + } + + int IK = getInstKind(SU); + + // There is no export clause, we can schedule one as soon as its ready + if (IK == IDOther) + Available[IDOther].push_back(SU); + else + Pending[IK].push_back(SU); + +} + +bool R600SchedStrategy::regBelongsToClass(unsigned Reg, + const TargetRegisterClass *RC) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->contains(Reg); + } else { + return MRI->getRegClass(Reg) == RC; + } +} + +R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { + MachineInstr *MI = SU->getInstr(); + + if (TII->isTransOnly(MI)) + return AluTrans; + + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + return AluPredX; + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return AluT_XYZW; + case AMDGPU::COPY: + if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; + } + default: + break; + } + + // Does the instruction take a whole IG ? 
+ // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + return AluT_XYZW; + } + + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } + + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + + // Is the result already member of a X/Y/Z/W class ? + unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + // LDS src registers cannot be used in the Trans slot. + if (TII->readsLDSSrcReg(MI)) + return AluT_XYZW; + + return AluAny; + +} + +int R600SchedStrategy::getInstKind(SUnit* SU) { + int Opcode = SU->getInstr()->getOpcode(); + + if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) + return IDFetch; + + if (TII->isALUInstr(Opcode)) { + return IDAlu; + } + + switch (Opcode) { + case AMDGPU::PRED_X: + case AMDGPU::COPY: + case AMDGPU::CONST_COPY: + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return IDAlu; + default: + return IDOther; + } +} + +SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { + if (Q.empty()) + return nullptr; + for (std::vector::reverse_iterator It = Q.rbegin(), E = Q.rend(); + It != E; ++It) { + SUnit *SU = *It; + InstructionsGroupCandidate.push_back(SU->getInstr()); + if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) + && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) + ) { + InstructionsGroupCandidate.pop_back(); + Q.erase((It + 1).base()); + return SU; + } else { + InstructionsGroupCandidate.pop_back(); + } + } + return nullptr; +} + +void R600SchedStrategy::LoadAlu() { + std::vector &QSrc = Pending[IDAlu]; + for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { + AluKind AK = getAluKind(QSrc[i]); + AvailableAlus[AK].push_back(QSrc[i]); + } + QSrc.clear(); +} + +void R600SchedStrategy::PrepareNextSlot() { + DEBUG(dbgs() << "New Slot\n"); + assert (OccupedSlotsMask && "Slot wasn't filled"); + OccupedSlotsMask = 0; +// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) +// OccupedSlotsMask |= 16; + InstructionsGroupCandidate.clear(); + LoadAlu(); +} + +void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { + int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (DstIndex == -1) { + return; + } + unsigned DestReg = MI->getOperand(DstIndex).getReg(); + // PressureRegister crashes if an operand is def and used in the same inst + // and we try to constraint its regclass + for (MachineInstr::mop_iterator It = MI->operands_begin(), + E = MI->operands_end(); It != E; ++It) { + MachineOperand &MO = *It; + if (MO.isReg() && !MO.isDef() && + MO.getReg() == DestReg) + return; + } + // 
Constrains the regclass of DestReg to assign it to Slot + switch (Slot) { + case 0: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + break; + case 1: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + break; + case 2: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + break; + case 3: + MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + break; + } +} + +SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { + static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; + SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu); + if (SlotedSU) + return SlotedSU; + SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu); + if (UnslotedSU) + AssignSlot(UnslotedSU->getInstr(), Slot); + return UnslotedSU; +} + +unsigned R600SchedStrategy::AvailablesAluCount() const { + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + + AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + + AvailableAlus[AluPredX].size(); +} + +SUnit* R600SchedStrategy::pickAlu() { + while (AvailablesAluCount() || !Pending[IDAlu].empty()) { + if (!OccupedSlotsMask) { + // Bottom up scheduling : predX must comes first + if (!AvailableAlus[AluPredX].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluPredX], false); + } + // Flush physical reg copies (RA will discard them) + if (!AvailableAlus[AluDiscarded].empty()) { + OccupedSlotsMask |= 31; + return PopInst(AvailableAlus[AluDiscarded], false); + } + // If there is a T_XYZW alu available, use it + if (!AvailableAlus[AluT_XYZW].empty()) { + OccupedSlotsMask |= 15; + return PopInst(AvailableAlus[AluT_XYZW], false); + } + } + bool TransSlotOccuped = OccupedSlotsMask & 16; + if (!TransSlotOccuped && VLIW5) { + if (!AvailableAlus[AluTrans].empty()) { + OccupedSlotsMask |= 16; + return PopInst(AvailableAlus[AluTrans], false); + } + SUnit *SU = AttemptFillSlot(3, true); + if (SU) { + OccupedSlotsMask |= 16; + return SU; + } + } + for (int Chan = 3; Chan > -1; --Chan) { + bool isOccupied = OccupedSlotsMask & (1 << Chan); + if (!isOccupied) { + SUnit *SU = AttemptFillSlot(Chan, false); + if (SU) { + OccupedSlotsMask |= (1 << Chan); + InstructionsGroupCandidate.push_back(SU->getInstr()); + return SU; + } + } + } + PrepareNextSlot(); + } + return nullptr; +} + +SUnit* R600SchedStrategy::pickOther(int QID) { + SUnit *SU = nullptr; + std::vector &AQ = Available[QID]; + + if (AQ.empty()) { + MoveUnits(Pending[QID], AQ); + } + if (!AQ.empty()) { + SU = AQ.back(); + AQ.resize(AQ.size() - 1); + } + return SU; +} diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h new file mode 100644 index 00000000000..fc5b95c28e7 --- /dev/null +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -0,0 +1,103 @@ +//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H + +#include "R600InstrInfo.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace llvm { + +class R600SchedStrategy : public MachineSchedStrategy { + + const ScheduleDAGMILive *DAG; + const R600InstrInfo *TII; + const R600RegisterInfo *TRI; + MachineRegisterInfo *MRI; + + enum InstKind { + IDAlu, + IDFetch, + IDOther, + IDLast + }; + + enum AluKind { + AluAny, + AluT_X, + AluT_Y, + AluT_Z, + AluT_W, + AluT_XYZW, + AluPredX, + AluTrans, + AluDiscarded, // LLVM Instructions that are going to be eliminated + AluLast + }; + + std::vector Available[IDLast], Pending[IDLast]; + std::vector AvailableAlus[AluLast]; + std::vector PhysicalRegCopy; + + InstKind CurInstKind; + int CurEmitted; + InstKind NextInstKind; + + unsigned AluInstCount; + unsigned FetchInstCount; + + int InstKindLimit[IDLast]; + + int OccupedSlotsMask; + +public: + R600SchedStrategy() : + DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { + } + + virtual ~R600SchedStrategy() {} + + void initialize(ScheduleDAGMI *dag) override; + SUnit *pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; + void releaseTopNode(SUnit *SU) override; + void releaseBottomNode(SUnit *SU) override; + +private: + std::vector InstructionsGroupCandidate; + bool VLIW5; + + int getInstKind(SUnit *SU); + bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; + AluKind getAluKind(SUnit *SU) const; + void LoadAlu(); + unsigned AvailablesAluCount() const; + SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); + void PrepareNextSlot(); + SUnit *PopInst(std::vector &Q, bool AnyALU); + + void AssignSlot(MachineInstr *MI, unsigned Slot); + SUnit* pickAlu(); + SUnit* pickOther(int QID); + void MoveUnits(std::vector &QSrc, std::vector &QDst); +}; + +} // namespace llvm + +#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp new file mode 100644 index 00000000000..0c06ccc736d --- /dev/null +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -0,0 +1,382 @@ +//===--------------------- R600MergeVectorRegisters.cpp -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass merges inputs of swizzeable instructions into vector sharing +/// common data and/or have enough undef subreg using swizzle abilities. +/// +/// For instance let's consider the following pseudo code : +/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... +/// vreg7 = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 +/// +/// is turned into : +/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// ... 
+/// vreg7 = INSERT_SUBREG vreg4, sub3 +/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 +/// +/// This allow regalloc to reduce register pressure for vector registers and +/// to reduce MOV count. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "vec-merger" + +namespace { + +static bool +isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { + for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), + E = MRI.def_instr_end(); It != E; ++It) { + return (*It).isImplicitDef(); + } + if (MRI.isReserved(Reg)) { + return false; + } + llvm_unreachable("Reg without a def"); + return false; +} + +class RegSeqInfo { +public: + MachineInstr *Instr; + DenseMap RegToChan; + std::vector UndefReg; + RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { + assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { + MachineOperand &MO = Instr->getOperand(i); + unsigned Chan = Instr->getOperand(i + 1).getImm(); + if (isImplicitlyDef(MRI, MO.getReg())) + UndefReg.push_back(Chan); + else + RegToChan[MO.getReg()] = Chan; + } + } + RegSeqInfo() {} + + bool operator==(const RegSeqInfo &RSI) const { + return RSI.Instr == Instr; + } +}; + +class R600VectorRegMerger : public MachineFunctionPass { +private: + MachineRegisterInfo *MRI; + const R600InstrInfo *TII; + bool canSwizzle(const MachineInstr &) const; + bool areAllUsesSwizzeable(unsigned Reg) const; + void SwizzleInput(MachineInstr &, + const std::vector > &) const; + bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, + std::vector > &Remap) const; + bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan); + bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan); + MachineInstr *RebuildVector(RegSeqInfo *MI, + const RegSeqInfo *BaseVec, + const std::vector > &RemapChan) const; + void RemoveMI(MachineInstr *); + void trackRSI(const RegSeqInfo &RSI); + + typedef DenseMap > InstructionSetMap; + DenseMap PreviousRegSeq; + InstructionSetMap PreviousRegSeqByReg; + InstructionSetMap PreviousRegSeqByUndefCount; +public: + static char ID; + R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), + TII(nullptr) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Vector Registers Merge Pass"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; + +char R600VectorRegMerger::ID = 0; + +bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) + const { + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) + return true; + switch (MI.getOpcode()) { + case AMDGPU::R600_ExportSwz: + case AMDGPU::EG_ExportSwz: + return true; + default: + return false; + } +} + +bool 
R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, + RegSeqInfo *ToMerge, std::vector< std::pair > &Remap) + const { + unsigned CurrentUndexIdx = 0; + for (DenseMap::iterator It = ToMerge->RegToChan.begin(), + E = ToMerge->RegToChan.end(); It != E; ++It) { + DenseMap::const_iterator PosInUntouched = + Untouched->RegToChan.find((*It).first); + if (PosInUntouched != Untouched->RegToChan.end()) { + Remap.push_back(std::pair + ((*It).second, (*PosInUntouched).second)); + continue; + } + if (CurrentUndexIdx >= Untouched->UndefReg.size()) + return false; + Remap.push_back(std::pair + ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + } + + return true; +} + +static +unsigned getReassignedChan( + const std::vector > &RemapChan, + unsigned Chan) { + for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { + if (RemapChan[j].first == Chan) + return RemapChan[j].second; + } + llvm_unreachable("Chan wasn't reassigned"); +} + +MachineInstr *R600VectorRegMerger::RebuildVector( + RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, + const std::vector > &RemapChan) const { + unsigned Reg = RSI->Instr->getOperand(0).getReg(); + MachineBasicBlock::iterator Pos = RSI->Instr; + MachineBasicBlock &MBB = *Pos->getParent(); + DebugLoc DL = Pos->getDebugLoc(); + + unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); + DenseMap UpdatedRegToChan = BaseRSI->RegToChan; + std::vector UpdatedUndef = BaseRSI->UndefReg; + for (DenseMap::iterator It = RSI->RegToChan.begin(), + E = RSI->RegToChan.end(); It != E; ++It) { + unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned SubReg = (*It).first; + unsigned Swizzle = (*It).second; + unsigned Chan = getReassignedChan(RemapChan, Swizzle); + + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + DstReg) + .addReg(SrcVec) + .addReg(SubReg) + .addImm(Chan); + UpdatedRegToChan[SubReg] = Chan; + std::vector::iterator ChanPos = + std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); + if (ChanPos != UpdatedUndef.end()) + UpdatedUndef.erase(ChanPos); + assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == + UpdatedUndef.end() && + "UpdatedUndef shouldn't contain Chan more than once!"); + DEBUG(dbgs() << " ->"; Tmp->dump();); + (void)Tmp; + SrcVec = DstReg; + } + Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) + .addReg(SrcVec); + DEBUG(dbgs() << " ->"; Pos->dump();); + + DEBUG(dbgs() << " Updating Swizzle:\n"); + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); + SwizzleInput(*It, RemapChan); + DEBUG((*It).dump()); + } + RSI->Instr->eraseFromParent(); + + // Update RSI + RSI->Instr = Pos; + RSI->RegToChan = UpdatedRegToChan; + RSI->UndefReg = UpdatedUndef; + + return Pos; +} + +void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { + for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), + E = PreviousRegSeqByReg.end(); It != E; ++It) { + std::vector &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } + for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), + E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { + std::vector &MIs = (*It).second; + MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); + } +} + +void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, + const std::vector > &RemapChan) const { + unsigned Offset; + if (TII->get(MI.getOpcode()).TSFlags & 
R600_InstFlag::TEX_INST) + Offset = 2; + else + Offset = 3; + for (unsigned i = 0; i < 4; i++) { + unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; + for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { + if (RemapChan[j].first == Swizzle) { + MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + break; + } + } + } +} + +bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { + for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), + E = MRI->use_instr_end(); It != E; ++It) { + if (!canSwizzle(*It)) + return false; + } + return true; +} + +bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan) { + for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), + MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { + if (!MOp->isReg()) + continue; + if (PreviousRegSeqByReg[MOp->getReg()].empty()) + continue; + for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { + CompatibleRSI = PreviousRegSeq[MI]; + if (RSI == CompatibleRSI) + continue; + if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) + return true; + } + } + return false; +} + +bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, + RegSeqInfo &CompatibleRSI, + std::vector > &RemapChan) { + unsigned NeededUndefs = 4 - RSI.UndefReg.size(); + if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) + return false; + std::vector &MIs = + PreviousRegSeqByUndefCount[NeededUndefs]; + CompatibleRSI = PreviousRegSeq[MIs.back()]; + tryMergeVector(&CompatibleRSI, &RSI, RemapChan); + return true; +} + +void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { + for (DenseMap::const_iterator + It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { + PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); + } + PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); + PreviousRegSeq[RSI.Instr] = RSI; +} + +bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { + TII = static_cast(Fn.getSubtarget().getInstrInfo()); + MRI = &(Fn.getRegInfo()); + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock *MB = MBB; + PreviousRegSeq.clear(); + PreviousRegSeqByReg.clear(); + PreviousRegSeqByUndefCount.clear(); + + for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); + MII != MIIE; ++MII) { + MachineInstr *MI = MII; + if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { + if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { + unsigned Reg = MI->getOperand(1).getReg(); + for (MachineRegisterInfo::def_instr_iterator + It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); + It != E; ++It) { + RemoveMI(&(*It)); + } + } + continue; + } + + + RegSeqInfo RSI(*MRI, MI); + + // All uses of MI are swizzeable ? 
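+ // Merging rewrites the swizzle operands of every user of this REG_SEQUENCE
+ // (see SwizzleInput), so it is only safe when each user is a TEX or
+ // export-swizzle instruction accepted by canSwizzle().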
+ unsigned Reg = MI->getOperand(0).getReg(); + if (!areAllUsesSwizzeable(Reg)) + continue; + + DEBUG (dbgs() << "Trying to optimize "; + MI->dump(); + ); + + RegSeqInfo CandidateRSI; + std::vector > RemapChan; + DEBUG(dbgs() << "Using common slots...\n";); + if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { + // Remove CandidateRSI mapping + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + DEBUG(dbgs() << "Using free slots...\n";); + RemapChan.clear(); + if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { + RemoveMI(CandidateRSI.Instr); + MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); + trackRSI(RSI); + continue; + } + //Failed to merge + trackRSI(RSI); + } + } + return false; +} + +} + +llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { + return new R600VectorRegMerger(tm); +} diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp new file mode 100644 index 00000000000..deee5bc3997 --- /dev/null +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -0,0 +1,408 @@ +//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass implements instructions packetization for R600. It unsets isLast +/// bit of instructions inside a bundle and substitutes src register with +/// PreviousVector when applicable. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Debug.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "packets" + +namespace { + +class R600Packetizer : public MachineFunctionPass { + +public: + static char ID; + R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const override { + return "R600 Packetizer"; + } + + bool runOnMachineFunction(MachineFunction &Fn) override; +}; +char R600Packetizer::ID = 0; + +class R600PacketizerList : public VLIWPacketizerList { + +private: + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + bool VLIW5; + bool ConsideredInstUsesAlreadyWrittenVectorElement; + + unsigned getSlot(const MachineInstr *MI) const { + return TRI.getHWRegChan(MI->getOperand(0).getReg()); + } + + /// \returns register to PV chan mapping for bundle/single instructions that + /// immediately precedes I. 
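+ /// For example (register numbers are illustrative): if the preceding group
+ /// wrote T0.X and T1.Y, reads of those registers in the current group can be
+ /// replaced by PV.X and PV.Y, while a value produced in the trans slot is
+ /// read back through PS instead.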
+ DenseMap getPreviousVector(MachineBasicBlock::iterator I) + const { + DenseMap Result; + I--; + if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) + return Result; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + if (I->isBundle()) + BI++; + int LastDstChan = -1; + do { + bool isTrans = false; + int BISlot = getSlot(BI); + if (LastDstChan >= BISlot) + isTrans = true; + LastDstChan = BISlot; + if (TII->isPredicated(BI)) + continue; + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) + continue; + int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + if (DstIdx == -1) { + continue; + } + unsigned Dst = BI->getOperand(DstIdx).getReg(); + if (isTrans || TII->isTransOnly(BI)) { + Result[Dst] = AMDGPU::PS; + continue; + } + if (BI->getOpcode() == AMDGPU::DOT4_r600 || + BI->getOpcode() == AMDGPU::DOT4_eg) { + Result[Dst] = AMDGPU::PV_X; + continue; + } + if (Dst == AMDGPU::OQAP) { + continue; + } + unsigned PVReg = 0; + switch (TRI.getHWRegChan(Dst)) { + case 0: + PVReg = AMDGPU::PV_X; + break; + case 1: + PVReg = AMDGPU::PV_Y; + break; + case 2: + PVReg = AMDGPU::PV_Z; + break; + case 3: + PVReg = AMDGPU::PV_W; + break; + default: + llvm_unreachable("Invalid Chan"); + } + Result[Dst] = PVReg; + } while ((++BI)->isBundledWithPred()); + return Result; + } + + void substitutePV(MachineInstr *MI, const DenseMap &PVs) + const { + unsigned Ops[] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 + }; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0) + continue; + unsigned Src = MI->getOperand(OperandIdx).getReg(); + const DenseMap::const_iterator It = PVs.find(Src); + if (It != PVs.end()) + MI->getOperand(OperandIdx).setReg(It->second); + } + } +public: + // Ctor. + R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + : VLIWPacketizerList(MF, MLI, true), + TII(static_cast( + MF.getSubtarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { + VLIW5 = !MF.getSubtarget().hasCaymanISA(); + } + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() override { + ConsideredInstUsesAlreadyWrittenVectorElement = false; + } + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(MachineInstr *MI, + MachineBasicBlock *MBB) override { + return false; + } + + // isSoloInstruction - return true if instruction MI can not be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(MachineInstr *MI) override { + if (TII->isVector(*MI)) + return true; + if (!TII->isALUInstr(MI->getOpcode())) + return true; + if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + return true; + // XXX: This can be removed once the packetizer properly handles all the + // LDS instruction group restrictions. + if (TII->isLDSInstr(MI->getOpcode())) + return true; + return false; + } + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { + MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); + if (getSlot(MII) == getSlot(MIJ)) + ConsideredInstUsesAlreadyWrittenVectorElement = true; + // Does MII and MIJ share the same pred_sel ? 
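+ // Instructions guarded by different predicate selects cannot issue in the
+ // same bundle, so reject the pair below when the pred_sel operands differ.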
+ int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + if (PredI != PredJ) + return false; + if (SUJ->isSucc(SUI)) { + for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { + const SDep &Dep = SUJ->Succs[i]; + if (Dep.getSUnit() != SUI) + continue; + if (Dep.getKind() == SDep::Anti) + continue; + if (Dep.getKind() == SDep::Output) + if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) + continue; + return false; + } + } + + bool ARDef = TII->definesAddressRegister(MII) || + TII->definesAddressRegister(MIJ); + bool ARUse = TII->usesAddressRegister(MII) || + TII->usesAddressRegister(MIJ); + if (ARDef && ARUse) + return false; + + return true; + } + + // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // and SUJ. + bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { + return false; + } + + void setIsLastBit(MachineInstr *MI, unsigned Bit) const { + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + MI->getOperand(LastOp).setImm(Bit); + } + + bool isBundlableWithCurrentPMI(MachineInstr *MI, + const DenseMap &PV, + std::vector &BS, + bool &isTransSlot) { + isTransSlot = TII->isTransOnly(MI); + assert (!isTransSlot || VLIW5); + + // Is the dst reg sequence legal ? + if (!isTransSlot && !CurrentPacketMIs.empty()) { + if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { + if (ConsideredInstUsesAlreadyWrittenVectorElement && + !TII->isVectorOnly(MI) && VLIW5) { + isTransSlot = true; + DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); + } + else + return false; + } + } + + // Are the Constants limitations met ? + CurrentPacketMIs.push_back(MI); + if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Consts read limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // Is there a BankSwizzle set that meet Read Port limitations ? + if (!TII->fitsReadPortLimitations(CurrentPacketMIs, + PV, BS, isTransSlot)) { + DEBUG( + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Read port limitations\n"; + ); + CurrentPacketMIs.pop_back(); + return false; + } + + // We cannot read LDS source registrs from the Trans slot. + if (isTransSlot && TII->readsLDSSrcReg(MI)) + return false; + + CurrentPacketMIs.pop_back(); + return true; + } + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { + MachineBasicBlock::iterator FirstInBundle = + CurrentPacketMIs.empty() ? 
MI : CurrentPacketMIs.front(); + const DenseMap &PV = + getPreviousVector(FirstInBundle); + std::vector BS; + bool isTransSlot; + + if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { + for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { + MachineInstr *MI = CurrentPacketMIs[i]; + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS[i]); + } + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + AMDGPU::OpName::bank_swizzle); + MI->getOperand(Op).setImm(BS.back()); + if (!CurrentPacketMIs.empty()) + setIsLastBit(CurrentPacketMIs.back(), 0); + substitutePV(MI, PV); + MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); + if (isTransSlot) { + endPacket(std::next(It)->getParent(), std::next(It)); + } + return It; + } + endPacket(MI->getParent(), MI); + if (TII->isTransOnly(MI)) + return MI; + return VLIWPacketizerList::addToPacket(MI); + } +}; + +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis(); + + // Instantiate the packetizer. + R600PacketizerList Packetizer(Fn, MLI); + + // DFA state table should not be empty. + assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + + // + // Loop over all basic blocks and remove KILL pseudo-instructions + // These instructions confuse the dependence analysis. Consider: + // D0 = ... (Insn 0) + // R0 = KILL R0, D0 (Insn 1) + // R0 = ... (Insn 2) + // Here, Insn 1 will result in the dependence graph not emitting an output + // dependence between Insn 0 and Insn 2. This can lead to incorrect + // packetization + // + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock::iterator End = MBB->end(); + MachineBasicBlock::iterator MI = MBB->begin(); + while (MI != End) { + if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || + (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + MachineBasicBlock::iterator DeleteMI = MI; + ++MI; + MBB->erase(DeleteMI); + End = MBB->end(); + continue; + } + ++MI; + } + } + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + // Find scheduling regions and schedule / packetize each region. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin();) { + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + break; + } + I = MBB->begin(); + + // Skip empty scheduling regions. + if (I == RegionEnd) { + RegionEnd = std::prev(RegionEnd); + --RemainingCount; + continue; + } + // Skip regions with one instruction. 
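+ // (A lone instruction already forms its own packet, so there is nothing
+ // for the packetizer to do in this region.)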
+ if (I == std::prev(RegionEnd)) { + RegionEnd = std::prev(RegionEnd); + continue; + } + + Packetizer.PacketizeMIs(MBB, I, RegionEnd); + RegionEnd = I; + } + } + + return true; + +} + +} // end anonymous namespace + +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { + return new R600Packetizer(tm); +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp new file mode 100644 index 00000000000..fb0359cfc65 --- /dev/null +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -0,0 +1,91 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "R600RegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { + RCW.RegWeight = 0; + RCW.WeightLimit = 0; +} + +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + + const R600InstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + + Reserved.set(AMDGPU::ZERO); + Reserved.set(AMDGPU::HALF); + Reserved.set(AMDGPU::ONE); + Reserved.set(AMDGPU::ONE_INT); + Reserved.set(AMDGPU::NEG_HALF); + Reserved.set(AMDGPU::NEG_ONE); + Reserved.set(AMDGPU::PV_X); + Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); + Reserved.set(AMDGPU::PREDICATE_BIT); + Reserved.set(AMDGPU::PRED_SEL_OFF); + Reserved.set(AMDGPU::PRED_SEL_ZERO); + Reserved.set(AMDGPU::PRED_SEL_ONE); + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + TII->reserveIndirectRegisters(Reserved, MF); + + return Reserved; +} + +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { + return GET_REG_INDEX(getEncodingValue(Reg)); +} + +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + } +} + +const RegClassWeight &R600RegisterInfo::getRegClassWeight( + const TargetRegisterClass *RC) const { + return RCW; +} + +bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + switch (Reg) { + case AMDGPU::OQAP: + case AMDGPU::OQBP: + case AMDGPU::AR_X: + return false; + default: + return true; + } +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h new file mode 100644 index 00000000000..9713e600a72 --- /dev/null +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -0,0 +1,49 @@ +//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for R600RegisterInfo +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H + +#include "AMDGPURegisterInfo.h" + +namespace llvm { + +class AMDGPUSubtarget; + +struct R600RegisterInfo : public AMDGPURegisterInfo { + RegClassWeight RCW; + + R600RegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + /// \brief get the HW encoding for a register's channel. + unsigned getHWRegChan(unsigned reg) const; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + + const RegClassWeight & + getRegClassWeight(const TargetRegisterClass *RC) const override; + + // \returns true if \p Reg can be defined in one ALU caluse and used in another. + bool isPhysRegLiveAcrossClauses(unsigned Reg) const; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td new file mode 100644 index 00000000000..cc667d985a8 --- /dev/null +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -0,0 +1,252 @@ + +class R600Reg encoding> : Register { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +class R600RegWithChan sel, string chan> : + Register { + + field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, + !if(!eq(chan, "Y"), 1, + !if(!eq(chan, "Z"), 2, + !if(!eq(chan, "W"), 3, 0)))); + let HWEncoding{8-0} = sel; + let HWEncoding{10-9} = chan_encoding; + let Namespace = "AMDGPU"; +} + +class R600Reg_128 subregs, bits<16> encoding> : + RegisterWithSubRegs { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1, sub2, sub3]; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64 subregs, bits<16> encoding> : + RegisterWithSubRegs { + field bits<2> chan_encoding = 0; + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; +} + +class R600Reg_64Vertical : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast("T"#lo#"_"#chan), !cast("T"#hi#"_"#chan)], + lo +>; + +foreach Index = 0-127 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; + + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + } + // 128-bit Temporary Registers + def T#Index#_XYZW : R600Reg_128 <"T"#Index#"", + [!cast("T"#Index#"_X"), + !cast("T"#Index#"_Y"), + !cast("T"#Index#"_Z"), + !cast("T"#Index#"_W")], + Index>; + + def T#Index#_XY : R600Reg_64 <"T"#Index#"", + [!cast("T"#Index#"_X"), + !cast("T"#Index#"_Y")], + Index>; +} + +foreach Chan = [ "X", "Y", "Z", "W"] in { + + let chan_encoding = !if(!eq(Chan, "X"), 0, + !if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast("T0_"#Chan), + !cast("T1_"#Chan), + !cast("T2_"#Chan), + !cast("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } 
+} + + +// KCACHE_BANK0 +foreach Index = 159-128 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW", + [!cast("KC0_"#Index#"_X"), + !cast("KC0_"#Index#"_Y"), + !cast("KC0_"#Index#"_Z"), + !cast("KC0_"#Index#"_W")], + Index>; +} + +// KCACHE_BANK1 +foreach Index = 191-160 in { + foreach Chan = [ "X", "Y", "Z", "W" ] in { + // 32-bit Temporary Registers + def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW", + [!cast("KC1_"#Index#"_X"), + !cast("KC1_"#Index#"_Y"), + !cast("KC1_"#Index#"_Z"), + !cast("KC1_"#Index#"_W")], + Index>; +} + + +// Array Base Register holding input in FS +foreach Index = 448-480 in { + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; +} + + +// Special Registers + +def OQA : R600Reg<"OQA", 219>; +def OQB : R600Reg<"OQB", 220>; +def OQAP : R600Reg<"OQAP", 221>; +def OQBP : R600Reg<"OQAP", 222>; +def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>; +def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>; +def ZERO : R600Reg<"0.0", 248>; +def ONE : R600Reg<"1.0", 249>; +def NEG_ONE : R600Reg<"-1.0", 249>; +def ONE_INT : R600Reg<"1", 250>; +def HALF : R600Reg<"0.5", 252>; +def NEG_HALF : R600Reg<"-0.5", 252>; +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; +def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; +def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; +def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; +def PV_X : R600RegWithChan<"PV.X", 254, "X">; +def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">; +def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">; +def PV_W : R600RegWithChan<"PV.W", 254, "W">; +def PS: R600Reg<"PS", 255>; +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; + +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "ArrayBase%u", 448, 480))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; + +let isAllocatable = 0 in { + +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME if we add more vertical vector registers we will need to ad more +// registers to these classes. 
+def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; + +def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, + (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; + +def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_X", 128, 159))>; + +def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Y", 128, 159))>; + +def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_Z", 128, 159))>; + +def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC0_%u_W", 128, 159))>; + +def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC0_X, R600_KC0_Y, + R600_KC0_Z, R600_KC0_W)>; + +def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_X", 160, 191))>; + +def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Y", 160, 191))>; + +def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_Z", 160, 191))>; + +def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "KC1_%u_W", 160, 191))>; + +def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_KC1_X, R600_KC1_Y, + R600_KC1_Z, R600_KC1_W)>; + +} // End isAllocatable = 0 + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_X", 0, 127), AR_X)>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Y", 0, 127))>; + +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Z", 0, 127))>; + +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_W", 0, 127))>; + +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; + +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add + R600_TReg32, + R600_ArrayBase, + R600_Addr, + R600_KC0, R600_KC1, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM, OQAP + )>; + +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; + +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add + PREDICATE_BIT)>; + +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add (sequence "T%u_XYZW", 0, 127))> { + let CopyCost = -1; +} + +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td new file mode 100644 index 00000000000..df62bf85c0a --- /dev/null +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -0,0 +1,49 @@ +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS +// slot has been removed. +// +//===----------------------------------------------------------------------===// + + +def ALU_X : FuncUnit; +def ALU_Y : FuncUnit; +def ALU_Z : FuncUnit; +def ALU_W : FuncUnit; +def TRANS : FuncUnit; + +def AnyALU : InstrItinClass; +def VecALU : InstrItinClass; +def TransALU : InstrItinClass; +def XALU : InstrItinClass; + +def R600_VLIW5_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], + [], + [ + InstrItinData]>, + InstrItinData]>, + InstrItinData]>, + InstrItinData]>, + InstrItinData]> + ] +>; + +def R600_VLIW4_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], + [], + [ + InstrItinData]>, + InstrItinData]>, + InstrItinData]>, + InstrItinData]> + ] +>; diff --git a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp new file mode 100644 index 00000000000..2fc7b02f673 --- /dev/null +++ b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp @@ -0,0 +1,303 @@ +//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass translates tgsi-like texture intrinsics into R600 texture +/// closer to hardware intrinsics. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { +class R600TextureIntrinsicsReplacer : + public FunctionPass, public InstVisitor { + static char ID; + + Module *Mod; + Type *FloatType; + Type *Int32Type; + Type *V4f32Type; + Type *V4i32Type; + FunctionType *TexSign; + FunctionType *TexQSign; + + void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, + unsigned SrcSelect[4], unsigned CT[4], + bool &useShadowVariant) { + enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY, + TEXTURE_SHADOWCUBE, + TEXTURE_2D_MSAA, + TEXTURE_2D_ARRAY_MSAA, + TEXTURE_CUBE_ARRAY, + TEXTURE_SHADOWCUBE_ARRAY + }; + + switch (TextureType) { + case 0: + useShadowVariant = false; + return; + case TEXTURE_RECT: + case TEXTURE_1D: + case TEXTURE_2D: + case TEXTURE_3D: + case TEXTURE_CUBE: + case TEXTURE_1D_ARRAY: + case TEXTURE_2D_ARRAY: + case TEXTURE_CUBE_ARRAY: + case TEXTURE_2D_MSAA: + case TEXTURE_2D_ARRAY_MSAA: + useShadowVariant = false; + break; + case TEXTURE_SHADOW1D: + case TEXTURE_SHADOW2D: + case TEXTURE_SHADOWRECT: + case TEXTURE_SHADOW1D_ARRAY: + case TEXTURE_SHADOW2D_ARRAY: + case TEXTURE_SHADOWCUBE: + case TEXTURE_SHADOWCUBE_ARRAY: + useShadowVariant = true; + break; + default: + llvm_unreachable("Unknow Texture Type"); + } + + if (TextureType == TEXTURE_RECT || + TextureType == TEXTURE_SHADOWRECT) { + CT[0] = 0; + CT[1] = 0; + } + + if (TextureType == TEXTURE_CUBE_ARRAY || + TextureType == TEXTURE_SHADOWCUBE_ARRAY) + CT[2] = 0; + + if (TextureType == 
TEXTURE_1D_ARRAY || + TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (hasLOD && useShadowVariant) { + CT[1] = 0; + } else { + CT[2] = 0; + SrcSelect[2] = 1; + } + } else if (TextureType == TEXTURE_2D_ARRAY || + TextureType == TEXTURE_SHADOW2D_ARRAY) { + CT[2] = 0; + } + + if ((TextureType == TEXTURE_SHADOW1D || + TextureType == TEXTURE_SHADOW2D || + TextureType == TEXTURE_SHADOWRECT || + TextureType == TEXTURE_SHADOW1D_ARRAY) && + !(hasLOD && useShadowVariant)) + SrcSelect[3] = 2; + } + + void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, + unsigned SrcSelect[4], Value *Offset[3], Value *Resource, + Value *Sampler, unsigned CT[4], Value *Coord) { + IRBuilder<> Builder(&I); + Constant *Mask[] = { + ConstantInt::get(Int32Type, SrcSelect[0]), + ConstantInt::get(Int32Type, SrcSelect[1]), + ConstantInt::get(Int32Type, SrcSelect[2]), + ConstantInt::get(Int32Type, SrcSelect[3]) + }; + Value *SwizzleMask = ConstantVector::get(Mask); + Value *SwizzledCoord = + Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); + + Value *Args[] = { + SwizzledCoord, + Offset[0], + Offset[1], + Offset[2], + Resource, + Sampler, + ConstantInt::get(Int32Type, CT[0]), + ConstantInt::get(Int32Type, CT[1]), + ConstantInt::get(Int32Type, CT[2]), + ConstantInt::get(Int32Type, CT[3]) + }; + + Function *F = Mod->getFunction(Name); + if (!F) { + F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); + F->addFnAttr(Attribute::ReadNone); + } + I.replaceAllUsesWith(Builder.CreateCall(F, Args)); + I.eraseFromParent(); + } + + void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, + const char *VanillaInt, + const char *ShadowInt) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(1); + Value *SamplerId = I.getArgOperand(2); + + unsigned TextureType = + cast(I.getArgOperand(3))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0), + ConstantInt::get(Int32Type, 0) + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + + void ReplaceTXF(CallInst &I) { + Value *Coord = I.getArgOperand(0); + Value *ResourceId = I.getArgOperand(4); + Value *SamplerId = I.getArgOperand(5); + + unsigned TextureType = + cast(I.getArgOperand(6))->getZExtValue(); + + unsigned SrcSelect[4] = { 0, 1, 2, 3 }; + unsigned CT[4] = {1, 1, 1, 1}; + Value *Offset[3] = { + I.getArgOperand(1), + I.getArgOperand(2), + I.getArgOperand(3), + }; + bool useShadowVariant; + + getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, + useShadowVariant); + + ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, + Offset, ResourceId, SamplerId, CT, Coord); + } + +public: + R600TextureIntrinsicsReplacer(): + FunctionPass(ID) { + } + + bool doInitialization(Module &M) override { + LLVMContext &Ctx = M.getContext(); + Mod = &M; + FloatType = Type::getFloatTy(Ctx); + Int32Type = Type::getInt32Ty(Ctx); + V4f32Type = VectorType::get(FloatType, 4); + V4i32Type = VectorType::get(Int32Type, 4); + Type *ArgsType[] = { + V4f32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); + Type *ArgsQType[] = { + V4i32Type, + Int32Type, + Int32Type, 
+ Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + Int32Type, + }; + TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); + return false; + } + + bool runOnFunction(Function &F) override { + visit(F); + return false; + } + + const char *getPassName() const override { + return "R600 Texture Intrinsics Replacer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + } + + void visitCallInst(CallInst &I) { + if (!I.getCalledFunction()) + return; + + StringRef Name = I.getCalledFunction()->getName(); + if (Name == "llvm.AMDGPU.tex") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); + return; + } + if (Name == "llvm.AMDGPU.txl") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); + return; + } + if (Name == "llvm.AMDGPU.txb") { + ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); + return; + } + if (Name == "llvm.AMDGPU.txf") { + ReplaceTXF(I); + return; + } + if (Name == "llvm.AMDGPU.txq") { + ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); + return; + } + if (Name == "llvm.AMDGPU.ddx") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); + return; + } + if (Name == "llvm.AMDGPU.ddy") { + ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); + return; + } + } + +}; + +char R600TextureIntrinsicsReplacer::ID = 0; + +} + +FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { + return new R600TextureIntrinsicsReplacer(); +} diff --git a/lib/Target/AMDGPU/R700Instructions.td b/lib/Target/AMDGPU/R700Instructions.td new file mode 100644 index 00000000000..613a0d729bb --- /dev/null +++ b/lib/Target/AMDGPU/R700Instructions.td @@ -0,0 +1,21 @@ +//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TableGen definitions for instructions which are: +// - Available to R700 and newer VLIW4/VLIW5 GPUs +// - Available only on R700 family GPUs. +// +//===----------------------------------------------------------------------===// + +def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; + +let Predicates = [isR700] in { + def SIN_r700 : SIN_Common<0x6E>; + def COS_r700 : COS_Common<0x6F>; +} diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp new file mode 100644 index 00000000000..ccfbf1bf19e --- /dev/null +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -0,0 +1,365 @@ +//===-- SIAnnotateControlFlow.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Annotates the control flow with hardware specific intrinsics. 
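+///
+/// Conditional branches and loop back edges are rewritten in terms of the
+/// llvm.SI.if, llvm.SI.else, llvm.SI.break, llvm.SI.if.break,
+/// llvm.SI.else.break, llvm.SI.loop and llvm.SI.end.cf intrinsics declared
+/// below, which the SI backend later lowers to its hardware control-flow
+/// (exec mask) operations.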
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-annotate-control-flow" + +namespace { + +// Complex types used in this pass +typedef std::pair StackEntry; +typedef SmallVector StackVector; + +// Intrinsic names the control flow is annotated with +static const char *const IfIntrinsic = "llvm.SI.if"; +static const char *const ElseIntrinsic = "llvm.SI.else"; +static const char *const BreakIntrinsic = "llvm.SI.break"; +static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; +static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; +static const char *const LoopIntrinsic = "llvm.SI.loop"; +static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; + +class SIAnnotateControlFlow : public FunctionPass { + + static char ID; + + Type *Boolean; + Type *Void; + Type *Int64; + Type *ReturnStruct; + + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + Constant *Int64Zero; + + Constant *If; + Constant *Else; + Constant *Break; + Constant *IfBreak; + Constant *ElseBreak; + Constant *Loop; + Constant *EndCf; + + DominatorTree *DT; + StackVector Stack; + + LoopInfo *LI; + + bool isTopOfStack(BasicBlock *BB); + + Value *popSaved(); + + void push(BasicBlock *BB, Value *Saved); + + bool isElse(PHINode *Phi); + + void eraseIfUnused(PHINode *Phi); + + void openIf(BranchInst *Term); + + void insertElse(BranchInst *Term); + + Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); + + void handleLoop(BranchInst *Term); + + void closeControlFlow(BasicBlock *BB); + +public: + SIAnnotateControlFlow(): + FunctionPass(ID) { } + + bool doInitialization(Module &M) override; + + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "SI annotate control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char SIAnnotateControlFlow::ID = 0; + +/// \brief Initialize all the types and constants used in the pass +bool SIAnnotateControlFlow::doInitialization(Module &M) { + LLVMContext &Context = M.getContext(); + + Void = Type::getVoidTy(Context); + Boolean = Type::getInt1Ty(Context); + Int64 = Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + Int64Zero = ConstantInt::get(Int64, 0); + + If = M.getOrInsertFunction( + IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); + + Else = M.getOrInsertFunction( + ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); + + Break = M.getOrInsertFunction( + BreakIntrinsic, Int64, Int64, (Type *)nullptr); + + IfBreak = M.getOrInsertFunction( + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); + + ElseBreak = M.getOrInsertFunction( + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); + + Loop = M.getOrInsertFunction( + LoopIntrinsic, Boolean, Int64, (Type *)nullptr); + + EndCf = M.getOrInsertFunction( 
+ EndCfIntrinsic, Void, Int64, (Type *)nullptr); + + return false; +} + +/// \brief Is BB the last block saved on the stack ? +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { + return !Stack.empty() && Stack.back().first == BB; +} + +/// \brief Pop the last saved value from the control flow stack +Value *SIAnnotateControlFlow::popSaved() { + return Stack.pop_back_val().second; +} + +/// \brief Push a BB and saved value to the control flow stack +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { + Stack.push_back(std::make_pair(BB, Saved)); +} + +/// \brief Can the condition represented by this PHI node treated like +/// an "Else" block? +bool SIAnnotateControlFlow::isElse(PHINode *Phi) { + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + if (Phi->getIncomingBlock(i) == IDom) { + + if (Phi->getIncomingValue(i) != BoolTrue) + return false; + + } else { + if (Phi->getIncomingValue(i) != BoolFalse) + return false; + + } + } + return true; +} + +// \brief Erase "Phi" if it is not used any more +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + if (!Phi->hasNUsesOrMore(1)) + Phi->eraseFromParent(); +} + +/// \brief Open a new "If" block +void SIAnnotateControlFlow::openIf(BranchInst *Term) { + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Close the last "If" block and open a new "Else" block +void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + Value *Ret = CallInst::Create(Else, popSaved(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Recursively handle the condition leading to a loop +Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, + llvm::Loop *L) { + + // Only search through PHI nodes which are inside the loop. If we try this + // with PHI nodes that are outside of the loop, we end up inserting new PHI + // nodes outside of the loop which depend on values defined inside the loop. + // This will break the module with + // 'Instruction does not dominate all users!' errors. 
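+ // Conditions that are not such PHIs fall through to the else branch below:
+ // an llvm.SI.if.break call is emitted at the end of the defining block (or
+ // at the top of the loop header when the value is defined outside the loop)
+ // and its result is returned for the caller to wire into the break phi.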
+ PHINode *Phi = nullptr; + if ((Phi = dyn_cast(Cond)) && L->contains(Phi)) { + + BasicBlock *Parent = Phi->getParent(); + PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); + Value *Ret = NewPhi; + + // Handle all non-constant incoming values first + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *From = Phi->getIncomingBlock(i); + if (isa(Incoming)) { + NewPhi->addIncoming(Broken, From); + continue; + } + + Phi->setIncomingValue(i, BoolFalse); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L); + NewPhi->addIncoming(PhiArg, From); + } + + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); + + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + + Value *Incoming = Phi->getIncomingValue(i); + if (Incoming != BoolTrue) + continue; + + BasicBlock *From = Phi->getIncomingBlock(i); + if (From == IDom) { + CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); + if (OldEnd && OldEnd->getCalledFunction() == EndCf) { + Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; + Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); + continue; + } + } + TerminatorInst *Insert = From->getTerminator(); + Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); + NewPhi->setIncomingValue(i, PhiArg); + } + eraseIfUnused(Phi); + return Ret; + + } else if (Instruction *Inst = dyn_cast(Cond)) { + BasicBlock *Parent = Inst->getParent(); + Instruction *Insert; + if (L->contains(Inst)) { + Insert = Parent->getTerminator(); + } else { + Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); + } + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Insert); + + } else { + llvm_unreachable("Unhandled loop condition!"); + } + return 0; +} + +/// \brief Handle a back edge (loop) +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + BasicBlock *BB = Term->getParent(); + llvm::Loop *L = LI->getLoopFor(BB); + BasicBlock *Target = Term->getSuccessor(1); + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + + Value *Cond = Term->getCondition(); + Term->setCondition(BoolTrue); + Value *Arg = handleLoopCondition(Cond, Broken, L); + + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); + PI != PE; ++PI) { + + Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); + } + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + push(Term->getSuccessor(0), Arg); +}/// \brief Close the last opened control flow +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + llvm::Loop *L = LI->getLoopFor(BB); + + if (L && L->getHeader() == BB) { + // We can't insert an EndCF call into a loop header, because it will + // get executed on every iteration of the loop, when it should be + // executed only once before the loop. + SmallVector Latches; + L->getLoopLatches(Latches); + + std::vector Preds; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) + Preds.push_back(*PI); + } + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, + LI, false); + } + + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); +} + +/// \brief Annotate the control flow with intrinsics so the backend can +/// recognize if/then/else and loops. 
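One way to read the i64 values threaded through the break/loop intrinsics above is as a per-lane "has left the loop" mask for a 64-wide wave: each break ORs the exiting lanes in, and the wave only falls out of the loop once every active lane has broken. A standalone C++ model of that accumulation follows; the 8-lane wave and the per-lane trip counts are invented for the illustration.

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t Exec = 0xFFu;   // 8 active lanes, for readability
      uint64_t Broken = 0;           // lanes that have left the loop so far
      int TripCount[8] = {1, 2, 2, 3, 1, 4, 2, 3};  // per-lane iteration counts

      for (int Iter = 1; (Broken & Exec) != Exec; ++Iter) {
        // Lanes whose exit condition becomes true this iteration set their
        // bit, mirroring how the break intrinsics OR new lanes into the mask.
        for (int Lane = 0; Lane < 8; ++Lane)
          if (Iter >= TripCount[Lane])
            Broken |= uint64_t(1) << Lane;
        std::printf("iter %d: broken mask = 0x%02llx\n", Iter,
                    (unsigned long long)Broken);
      }
      // The wave leaves the loop only once every active lane has broken out.
      return 0;
    }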
+bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis().getDomTree(); + LI = &getAnalysis().getLoopInfo(); + + for (df_iterator I = df_begin(&F.getEntryBlock()), + E = df_end(&F.getEntryBlock()); I != E; ++I) { + + BranchInst *Term = dyn_cast((*I)->getTerminator()); + + if (!Term || Term->isUnconditional()) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + continue; + } + + if (I.nodeVisited(Term->getSuccessor(1))) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + handleLoop(Term); + continue; + } + + if (isTopOfStack(*I)) { + PHINode *Phi = dyn_cast(Term->getCondition()); + if (Phi && Phi->getParent() == *I && isElse(Phi)) { + insertElse(Term); + eraseIfUnused(Phi); + continue; + } + closeControlFlow(*I); + } + openIf(Term); + } + + assert(Stack.empty()); + return true; +} + +/// \brief Create the annotation pass +FunctionPass *llvm::createSIAnnotateControlFlowPass() { + return new SIAnnotateControlFlow(); +} diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h new file mode 100644 index 00000000000..4727d971ab7 --- /dev/null +++ b/lib/Target/AMDGPU/SIDefines.h @@ -0,0 +1,172 @@ +//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCInstrDesc.h" + +#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H +#define LLVM_LIB_TARGET_R600_SIDEFINES_H + +namespace SIInstrFlags { +// This needs to be kept in sync with the field bits in InstSI. +enum { + SALU = 1 << 3, + VALU = 1 << 4, + + SOP1 = 1 << 5, + SOP2 = 1 << 6, + SOPC = 1 << 7, + SOPK = 1 << 8, + SOPP = 1 << 9, + + VOP1 = 1 << 10, + VOP2 = 1 << 11, + VOP3 = 1 << 12, + VOPC = 1 << 13, + + MUBUF = 1 << 14, + MTBUF = 1 << 15, + SMRD = 1 << 16, + DS = 1 << 17, + MIMG = 1 << 18, + FLAT = 1 << 19, + WQM = 1 << 20, + VGPRSpill = 1 << 21 +}; +} + +namespace llvm { +namespace AMDGPU { + enum OperandType { + /// Operand with register or 32-bit immediate + OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + /// Operand with register or inline constant + OPERAND_REG_INLINE_C + }; +} +} + +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 + }; + + // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. + // The result is true if any of these tests are true. 
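The format bits in SIInstrFlags above are intended to be ORed into an instruction's flag word (in LLVM, the TableGen-generated TSFlags) and queried with simple masks. A minimal standalone illustration, with the flag word treated as a plain integer and the helper names invented for the example:

    #include <cassert>
    #include <cstdint>

    namespace SIInstrFlags {
    enum : uint64_t {
      SALU = 1 << 3,  VALU = 1 << 4,
      VOP1 = 1 << 10, VOP2 = 1 << 11, VOP3 = 1 << 12, VOPC = 1 << 13,
      SMRD = 1 << 16, DS = 1 << 17
    };
    }

    // Hypothetical queries over an instruction's flag word.
    static bool isVALU(uint64_t TSFlags) {
      return (TSFlags & SIInstrFlags::VALU) != 0;
    }
    static bool isVOP3(uint64_t TSFlags) {
      return (TSFlags & SIInstrFlags::VOP3) != 0;
    }

    int main() {
      uint64_t TSFlags = SIInstrFlags::VALU | SIInstrFlags::VOP3;
      assert(isVALU(TSFlags) && isVOP3(TSFlags));
      assert((TSFlags & SIInstrFlags::SMRD) == 0);  // not a scalar memory read
      return 0;
    }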
+ enum ClassFlags { + S_NAN = 1 << 0, // Signaling NaN + Q_NAN = 1 << 1, // Quiet NaN + N_INFINITY = 1 << 2, // Negative infinity + N_NORMAL = 1 << 3, // Negative normal + N_SUBNORMAL = 1 << 4, // Negative subnormal + N_ZERO = 1 << 5, // Negative zero + P_ZERO = 1 << 6, // Positive zero + P_SUBNORMAL = 1 << 7, // Positive subnormal + P_NORMAL = 1 << 8, // Positive normal + P_INFINITY = 1 << 9 // Positive infinity + }; +} + +namespace SISrcMods { + enum { + NEG = 1 << 0, + ABS = 1 << 1 + }; +} + +namespace SIOutMods { + enum { + NONE = 0, + MUL2 = 1, + MUL4 = 2, + DIV2 = 3 + }; +} + +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C +#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) + +#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double +// precision. 
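The S_*/G_*/C_* macro triples above follow a set/get/clear pattern: S_ shifts a field value into position, G_ extracts it, and C_ masks the field out so a new value can be inserted. A small standalone demonstration on the FLOAT_MODE field of COMPUTE_PGM_RSRC1, with the relevant macro definitions copied from the header above:

    #include <cassert>
    #include <cstdint>

    #define S_00B848_FLOAT_MODE(x)  (((x) & 0xFF) << 12)
    #define G_00B848_FLOAT_MODE(x)  (((x) >> 12) & 0xFF)
    #define C_00B848_FLOAT_MODE     0xFFF00FFF
    #define FP_ROUND_ROUND_TO_ZERO  3
    #define FP_ROUND_MODE_SP(x)     ((x) & 0x3)

    int main() {
      uint32_t Rsrc1 = 0xFFFFFFFF;   // pretend register contents
      uint32_t Mode = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_ZERO);

      // Clear the old FLOAT_MODE field, then insert the new value.
      Rsrc1 = (Rsrc1 & C_00B848_FLOAT_MODE) | S_00B848_FLOAT_MODE(Mode);

      assert(G_00B848_FLOAT_MODE(Rsrc1) == Mode);  // G_ round-trips the field
      assert((Rsrc1 & 0xFFF) == 0xFFF);            // bits outside it untouched
      return 0;
    }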
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) + +#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 +#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) + + +#endif diff --git a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp new file mode 100644 index 00000000000..5fe8d19426d --- /dev/null +++ b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -0,0 +1,96 @@ +//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Spilling of EXEC masks used for control flow messes up control flow +/// lowering, so mark all live intervals associated with CF instructions as +/// non-spillable. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-cf-live-intervals" + +namespace { + +class SIFixControlFlowLiveIntervals : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { + initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix CF Live Intervals"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, + "SI Fix CF Live Intervals", false, false) + +char SIFixControlFlowLiveIntervals::ID = 0; + +char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; + +FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { + return new SIFixControlFlowLiveIntervals(); +} + +bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { + LiveIntervals *LIS = &getAnalysis(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (MI.getOpcode()) { + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + case AMDGPU::SI_END_CF: { + unsigned Reg = MI.getOperand(0).getReg(); + LIS->getInterval(Reg).markNotSpillable(); + break; + } + default: + break; + } + } + } + + return false; +} diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp new file mode 100644 index 00000000000..23502b45905 --- /dev/null +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -0,0 +1,338 @@ +//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Copies from VGPR to SGPR registers are illegal and the register coalescer +/// will sometimes generate these illegal copies in situations like this: +/// +/// Register Class is the union of and +/// +/// BB0: +/// %vreg0 = SCALAR_INST +/// %vreg1 = COPY %vreg0 +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 = VECTOR_INST +/// %vreg3 = COPY %vreg2 +/// BB2: +/// %vreg4 = PHI %vreg1 , , %vreg3 , +/// %vreg5 = VECTOR_INST %vreg4 +/// +/// +/// The coalescer will begin at BB0 and eliminate its copy, then the resulting +/// code will look like this: +/// +/// BB0: +/// %vreg0 = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 = VECTOR_INST +/// %vreg3 = COPY %vreg2 +/// BB2: +/// %vreg4 = PHI %vreg0 , , %vreg3 , +/// %vreg5 = VECTOR_INST %vreg4 +/// +/// Now that the result of the PHI instruction is an SGPR, the register +/// allocator is now forced to constrain the register class of %vreg3 to +/// so we end up with final code like this: +/// +/// BB0: +/// %vreg0 = SCALAR_INST +/// ... +/// BRANCH %cond BB1, BB2 +/// BB1: +/// %vreg2 = VECTOR_INST +/// %vreg3 = COPY %vreg2 +/// BB2: +/// %vreg4 = PHI %vreg0 , , %vreg3 , +/// %vreg5 = VECTOR_INST %vreg4 +/// +/// Now this code contains an illegal copy from a VGPR to an SGPR. +/// +/// In order to avoid this problem, this pass searches for PHI instructions +/// which define a register and constrains its definition class to +/// if the user of the PHI's definition register is a vector instruction. +/// If the PHI's definition class is constrained to then the coalescer +/// will be unable to perform the COPY removal from the above example which +/// ultimately led to the creation of an illegal COPY. 
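The underlying problem the comment above describes is divergence: an SGPR holds one value for the whole wave, while lanes that took different branches need different PHI results. A standalone C++ model of that conflict, with a four-lane wave and incoming values invented for the illustration:

    #include <cstdio>

    int main() {
      const int NumLanes = 4;
      bool Cond[NumLanes] = {true, false, true, false};  // divergent branch
      int  VgprPhi[NumLanes];  // per-lane storage: each lane keeps its own value
      int  SgprPhi = 0;        // one value shared by the whole wave

      // Lanes where Cond is true "execute BB1", the rest keep the BB0 value.
      for (int L = 0; L < NumLanes; ++L) {
        int Val = Cond[L] ? 200 : 100;  // the PHI's incoming values
        VgprPhi[L] = Val;               // correct: independent per lane
        SgprPhi    = Val;               // wrong: later lanes overwrite earlier ones
      }

      for (int L = 0; L < NumLanes; ++L)
        std::printf("lane %d: vgpr=%d sgpr=%d\n", L, VgprPhi[L], SgprPhi);
      // Lanes 0 and 2 expect 200 but the shared scalar ends up holding 100,
      // which is why such PHIs must stay in vector registers.
      return 0;
    }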
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "sgpr-copies" + +namespace { + +class SIFixSGPRCopies : public MachineFunctionPass { + +private: + static char ID; + const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const; + const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const; + bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const; + +public: + SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR copies"; + } + +}; + +} // End anonymous namespace + +char SIFixSGPRCopies::ID = 0; + +FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { + return new SIFixSGPRCopies(tm); +} + +static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (!MI.getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) + continue; + + if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) + return true; + } + return false; +} + +/// This functions walks the use list of Reg until it finds an Instruction +/// that isn't a COPY returns the register class of that instruction. +/// \return The register defined by the first non-COPY instruction. +const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const { + + const TargetRegisterClass *RC + = TargetRegisterInfo::isVirtualRegister(Reg) ? 
+ MRI.getRegClass(Reg) : + TRI->getPhysRegClass(Reg); + + RC = TRI->getSubRegClass(RC, SubReg); + for (MachineRegisterInfo::use_instr_iterator + I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { + switch (I->getOpcode()) { + case AMDGPU::COPY: + RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, + I->getOperand(0).getReg(), + I->getOperand(0).getSubReg())); + break; + } + } + + return RC; +} + +const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI, + unsigned Reg, + unsigned SubReg) const { + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); + return TRI->getSubRegClass(RC, SubReg); + } + MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() != AMDGPU::COPY) { + return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); + } + + return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), + Def->getOperand(1).getSubReg()); +} + +bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, + const SIRegisterInfo *TRI, + const MachineRegisterInfo &MRI) const { + + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { + // If the destination register is a physical register there isn't really + // much we can do to fix this. + return false; + } + + const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + + const TargetRegisterClass *SrcRC; + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + return false; + + SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); + return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); +} + +bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIRegisterInfo *TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); + DEBUG(MI.print(dbgs())); + TII->moveToVALU(MI); + + } + + switch (MI.getOpcode()) { + default: continue; + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + const MachineOperand &Op = MI.getOperand(i); + unsigned Reg = Op.getReg(); + const TargetRegisterClass *RC + = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + + MRI.constrainRegClass(Op.getReg(), RC); + } + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, + MI.getOperand(0).getSubReg()); + if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { + MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + } + + if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) + break; + + // If a PHI node defines an SGPR and any of its operands are VGPRs, + // then we need to move it to the VALU. 
+ // + // Also, if a PHI node defines an SGPR and has all SGPR operands + // we must move it to the VALU, because the SGPR operands will + // all end up being assigned the same register, which means + // there is a potential for a conflict if different threads take + // different control flow paths. + // + // For Example: + // + // sgpr0 = def; + // ... + // sgpr1 = def; + // ... + // sgpr2 = PHI sgpr0, sgpr1 + // use sgpr2; + // + // Will Become: + // + // sgpr2 = def; + // ... + // sgpr2 = def; + // ... + // use sgpr2 + // + // FIXME: This is OK if the branching decision is made based on an + // SGPR value. + bool SGPRBranch = false; + + // The one exception to this rule is when one of the operands + // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK + // instruction. In this case, there we know the program will + // never enter the second block (the loop) without entering + // the first block (where the condition is computed), so there + // is no chance for values to be over-written. + + bool HasBreakDef = false; + for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { + unsigned Reg = MI.getOperand(i).getReg(); + if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { + TII->moveToVALU(MI); + break; + } + MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); + assert(DefInstr); + switch(DefInstr->getOpcode()) { + + case AMDGPU::SI_BREAK: + case AMDGPU::SI_IF_BREAK: + case AMDGPU::SI_ELSE_BREAK: + // If we see a PHI instruction that defines an SGPR, then that PHI + // instruction has already been considered and should have + // a *_BREAK as an operand. + case AMDGPU::PHI: + HasBreakDef = true; + break; + } + } + + if (!SGPRBranch && !HasBreakDef) + TII->moveToVALU(MI); + break; + } + case AMDGPU::REG_SEQUENCE: { + if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || + !hasVGPROperands(MI, TRI)) + continue; + + DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); + + TII->moveToVALU(MI); + break; + } + case AMDGPU::INSERT_SUBREG: { + const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; + DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + if (TRI->isSGPRClass(DstRC) && + (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { + DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); + TII->moveToVALU(MI); + } + break; + } + } + } + } + + return true; +} diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp new file mode 100644 index 00000000000..0c54446b0fb --- /dev/null +++ b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -0,0 +1,192 @@ +//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// SALU instructions ignore control flow, so we need to modify the live ranges +/// of the registers they define in some cases. +/// +/// The main case we need to handle is when a def is used in one side of a +/// branch and not another. For example: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// +/// Here we need the register allocator to avoid assigning any of the defs +/// inside of the IF to the same register as %def. 
In traditional live +/// interval analysis %def is not live inside the IF branch, however, since +/// SALU instructions inside of IF will be executed even if the branch is not +/// taken, there is the chance that one of the instructions will overwrite the +/// value of %def, so the use in ELSE will see the wrong value. +/// +/// The strategy we use for solving this is to add an extra use after the ENDIF: +/// +/// %def +/// IF +/// ... +/// ... +/// ELSE +/// %use +/// ... +/// ENDIF +/// %use +/// +/// Adding this use will make the def live thoughout the IF branch, which is +/// what we want. + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-sgpr-live-ranges" + +namespace { + +class SIFixSGPRLiveRanges : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { + initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fix SGPR live ranges"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, + "SI Fix SGPR Live Ranges", false, false) + +char SIFixSGPRLiveRanges::ID = 0; + +char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; + +FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { + return new SIFixSGPRLiveRanges(); +} + +bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const SIRegisterInfo *TRI = static_cast( + MF.getSubtarget().getRegisterInfo()); + LiveIntervals *LIS = &getAnalysis(); + MachinePostDominatorTree *PDT = &getAnalysis(); + std::vector> SGPRLiveRanges; + + // First pass, collect all live intervals for SGPRs + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.defs()) { + if (MO.isImplicit()) + continue; + unsigned Def = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Def)) { + if (TRI->isSGPRClass(MRI.getRegClass(Def))) + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getInterval(Def))); + } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { + SGPRLiveRanges.push_back( + std::make_pair(Def, &LIS->getRegUnit(Def))); + } + } + } + } + + // Second pass fix the intervals + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + MachineBasicBlock &MBB = *BI; + if (MBB.succ_size() < 2) + continue; + + // We have structured control flow, so number of succesors should be two. 
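The hazard motivating this pass is that scalar instructions execute regardless of the exec mask, so an SGPR that the allocator considers dead on the untaken side of a branch can still be overwritten there. A standalone C++ model of the clobber; the register slots and values are invented, and real allocation is of course done by the register allocator:

    #include <cstdio>

    // Toy model: a "physical SGPR file" with two slots. If the allocator
    // thinks %def is dead inside the IF block, it may give the IF-side temp
    // the same slot as %def; because scalar writes ignore Exec, the temp is
    // written even when the whole wave is on the ELSE path.
    int main() {
      int SgprFile[2] = {0, 0};
      const int DefSlot = 0;

      SgprFile[DefSlot] = 42;      // %def
      bool WaveTakesElse = true;   // every lane branches to the ELSE block

      // The IF block's scalar instructions still execute:
      int TempSlot = DefSlot;      // allocator reused the slot (the bug)
      SgprFile[TempSlot] = 7;      // SALU write, not masked by Exec

      if (WaveTakesElse)
        std::printf("%%use in ELSE reads %d, expected 42\n", SgprFile[DefSlot]);

      // Adding an artificial use of %def after the ENDIF keeps it live across
      // both sides, forcing the allocator to pick a different slot for the temp.
      return 0;
    }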
+ assert(MBB.succ_size() == 2); + MachineBasicBlock *SuccA = *MBB.succ_begin(); + MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); + + if (!NCD) + continue; + + MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); + + if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { + assert(NCD->succ_size() == 2); + // We want to make sure we insert the Use after the ENDIF, not after + // the ELSE. + NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), + *(++NCD->succ_begin())); + } + assert(SuccA && SuccB); + for (std::pair RegLR : SGPRLiveRanges) { + unsigned Reg = RegLR.first; + LiveRange *LR = RegLR.second; + + // FIXME: We could be smarter here. If the register is Live-In to + // one block, but the other doesn't have any SGPR defs, then there + // won't be a conflict. Also, if the branch decision is based on + // a value in an SGPR, then there will be no conflict. + bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); + bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); + + if ((!LiveInToA && !LiveInToB) || + (LiveInToA && LiveInToB)) + continue; + + // This interval is live in to one successor, but not the other, so + // we need to update its range so it is live in to both. + DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << + " BB#" << SuccA->getNumber() << ", BB#" << + SuccB->getNumber() << + " with NCD = " << NCD->getNumber() << '\n'); + + // FIXME: Need to figure out how to update LiveRange here so this pass + // will be able to preserve LiveInterval analysis. + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + DEBUG(NCD->getFirstNonPHI()->dump()); + } + } + + return false; +} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp new file mode 100644 index 00000000000..d14e37a6461 --- /dev/null +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -0,0 +1,288 @@ +//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-fold-operands" +using namespace llvm; + +namespace { + +class SIFoldOperands : public MachineFunctionPass { +public: + static char ID; + +public: + SIFoldOperands() : MachineFunctionPass(ID) { + initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Fold Operands"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +struct FoldCandidate { + MachineInstr *UseMI; + unsigned UseOpNo; + MachineOperand *OpToFold; + uint64_t ImmToFold; + + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : + UseMI(MI), UseOpNo(OpNo) { + + if (FoldOp->isImm()) { + OpToFold = nullptr; + ImmToFold = FoldOp->getImm(); + } else { + assert(FoldOp->isReg()); + OpToFold = FoldOp; + } + } + + bool isImm() const { + return !OpToFold; + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) + +char SIFoldOperands::ID = 0; + +char &llvm::SIFoldOperandsID = SIFoldOperands::ID; + +FunctionPass *llvm::createSIFoldOperandsPass() { + return new SIFoldOperands(); +} + +static bool isSafeToFold(unsigned Opcode) { + switch(Opcode) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::COPY: + return true; + default: + return false; + } +} + +static bool updateOperand(FoldCandidate &Fold, + const TargetRegisterInfo &TRI) { + MachineInstr *MI = Fold.UseMI; + MachineOperand &Old = MI->getOperand(Fold.UseOpNo); + assert(Old.isReg()); + + if (Fold.isImm()) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } + + MachineOperand *New = Fold.OpToFold; + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && + TargetRegisterInfo::isVirtualRegister(New->getReg())) { + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); + return true; + } + + // FIXME: Handle physical registers. + + return false; +} + +static bool tryAddToFoldList(std::vector &FoldList, + MachineInstr *MI, unsigned OpNo, + MachineOperand *OpToFold, + const SIInstrInfo *TII) { + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + // Operand is not legal, so try to commute the instruction to + // see if this makes it possible to fold. 
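At its core the pass replaces a register operand with the value defined by a safe-to-fold move, provided the use can still encode the result. A standalone C++ sketch of that idea on a toy instruction representation; all types, names, and the inline-constant range check below are invented stand-ins, not the target's real rules:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy MI: an opcode plus operands that are either a register name or an
    // immediate. Operands[0] is the def.
    struct Operand {
      bool IsImm;
      long Imm;
      std::string Reg;
    };
    struct Inst {
      std::string Opcode;
      std::vector<Operand> Ops;
    };

    // Stand-in legality check for "can this use encode the immediate?".
    static bool canEncodeInline(long Imm) { return Imm >= -16 && Imm <= 64; }

    int main() {
      // %v0 = V_MOV_B32 7    ;    %v1 = V_ADD_I32 %v0, %v2
      Inst Mov{"V_MOV_B32", {{false, 0, "%v0"}, {true, 7, ""}}};
      Inst Add{"V_ADD_I32",
               {{false, 0, "%v1"}, {false, 0, "%v0"}, {false, 0, "%v2"}}};

      // Fold: if the moved value is an encodable immediate, rewrite every use
      // of the MOV's def to carry the immediate directly.
      if (Mov.Ops[1].IsImm && canEncodeInline(Mov.Ops[1].Imm)) {
        for (Operand &Op : Add.Ops)
          if (!Op.IsImm && Op.Reg == Mov.Ops[0].Reg)
            Op = Mov.Ops[1];
      }

      std::printf("%s now uses %s\n", Add.Opcode.c_str(),
                  Add.Ops[1].IsImm ? "an inline immediate" : "a register");
      return 0;
    }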
+ unsigned CommuteIdx0; + unsigned CommuteIdx1; + bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + + if (CanCommute) { + if (CommuteIdx0 == OpNo) + OpNo = CommuteIdx1; + else if (CommuteIdx1 == OpNo) + OpNo = CommuteIdx0; + } + + if (!CanCommute || !TII->commuteInstruction(MI)) + return false; + + if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + return false; + } + + FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); + return true; +} + +bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (!isSafeToFold(MI.getOpcode())) + continue; + + unsigned OpSize = TII->getOpSize(MI, 1); + MachineOperand &OpToFold = MI.getOperand(1); + bool FoldingImm = OpToFold.isImm(); + + // FIXME: We could also be folding things like FrameIndexes and + // TargetIndexes. + if (!FoldingImm && !OpToFold.isReg()) + continue; + + // Folding immediates with more than one use will increase program size. + // FIXME: This will also reduce register usage, which may be better + // in some cases. A better heuristic is needed. + if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && + !MRI.hasOneUse(MI.getOperand(0).getReg())) + continue; + + // FIXME: Fold operands with subregs. + if (OpToFold.isReg() && + (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || + OpToFold.getSubReg())) + continue; + + std::vector FoldList; + for (MachineRegisterInfo::use_iterator + Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); + Use != E; ++Use) { + + MachineInstr *UseMI = Use->getParent(); + const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + continue; + } + + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + // Split 64-bit constants into 32-bits for folding. + if (UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + continue; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + continue; + + UseMI->setDesc(TII->get(MovOp)); + } + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. 
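When a 64-bit constant reaches a use through a 32-bit sub-register, only the matching half is folded, which is what the sub0/sub1 handling above does with APInt::getLoBits/getHiBits. The same split in plain, standalone C++; the constant is arbitrary:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Imm = 0x123456789ABCDEF0ULL;  // the 64-bit constant being folded

      // sub0 is the low 32 bits of the register pair, sub1 the high 32 bits.
      uint32_t Sub0 = static_cast<uint32_t>(Imm & 0xFFFFFFFFu);
      uint32_t Sub1 = static_cast<uint32_t>(Imm >> 32);

      assert(Sub0 == 0x9ABCDEF0u && Sub1 == 0x12345678u);
      std::printf("sub0 = 0x%08X, sub1 = 0x%08X\n", Sub0, Sub1);

      // Reassembling the halves gives back the original constant.
      assert(((uint64_t(Sub1) << 32) | Sub0) == Imm);
      return 0;
    }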
+ if (UseDesc.isVariadic() || + UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) + continue; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); + continue; + } + + tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + } + + for (FoldCandidate &Fold : FoldList) { + if (updateOperand(Fold, TRI)) { + // Clear kill flags. + if (!Fold.isImm()) { + assert(Fold.OpToFold && Fold.OpToFold->isReg()); + Fold.OpToFold->setIsKill(false); + } + DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << + Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); + } + } + } + } + return false; +} diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp new file mode 100644 index 00000000000..12d08cf4c7f --- /dev/null +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -0,0 +1,2241 @@ +//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for SI +// +//===----------------------------------------------------------------------===// + +#ifdef _MSC_VER +// Provide M_PI. +#define _USE_MATH_DEFINES +#include +#endif + +#include "SIISelLowering.h" +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/Function.h" +#include "llvm/ADT/SmallString.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM, + const AMDGPUSubtarget &STI) + : AMDGPUTargetLowering(TM, STI) { + addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + + addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); + + addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); + + addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); + addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + + computeRegisterProperties(STI.getRegisterInfo()); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + setOperationAction(ISD::ADD, MVT::i32, Legal); + 
setOperationAction(ISD::ADDC, MVT::i32, Legal); + setOperationAction(ISD::ADDE, MVT::i32, Legal); + setOperationAction(ISD::SUBC, MVT::i32, Legal); + setOperationAction(ISD::SUBE, MVT::i32, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::v8i32, Custom); + setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::v8i32, Custom); + setOperationAction(ISD::STORE, MVT::v16i32, Custom); + + setOperationAction(ISD::STORE, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Promote); + AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::SETCC, MVT::v4i1, Expand); + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); + } + + for (MVT VT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + + 
setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i1, Custom); + + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // These should use UDIVREM, so set them to expand + setOperationAction(ISD::UDIV, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SELECT, MVT::i1, Promote); + + // We only support LOAD/STORE and vector manipulation ops for vectors + // with > 4 elements. + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + switch(Op) { + case ISD::LOAD: + case ISD::STORE: + case ISD::BUILD_VECTOR: + case ISD::BITCAST: + case ISD::EXTRACT_VECTOR_ELT: + case ISD::INSERT_VECTOR_ELT: + case ISD::INSERT_SUBVECTOR: + case ISD::EXTRACT_SUBVECTOR: + break; + case ISD::CONCAT_VECTORS: + setOperationAction(Op, VT, Custom); + break; + default: + setOperationAction(Op, VT, Expand); + break; + } + } + } + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + } + + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::f64, Custom); + + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::UMAX); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::UINT_TO_FP); + + // All memory operations. Some folding on the pointer operand is done to help + // matching the constant offsets in the addressing modes. + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ATOMIC_LOAD); + setTargetDAGCombine(ISD::ATOMIC_STORE); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); + setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); + setTargetDAGCombine(ISD::ATOMIC_SWAP); + setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); + setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); + setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); + setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); + setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); + + setSchedulingPreference(Sched::RegPressure); +} + +//===----------------------------------------------------------------------===// +// TargetLowering queries +//===----------------------------------------------------------------------===// + +bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl &, + EVT) const { + // SI has some legal vector types, but no legal vector operations. Say no + // shuffles are legal in order to prefer scalarizing some vector operations. 
+ return false; +} + +bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty, unsigned AS) const { + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // SMRD instructions have an 8-bit, dword offset. + // + // Assume nonunifom access, since the address space isn't enough to know + // what instruction we will use, and since we don't know if this is a load + // or store and scalar stores are only available on VI. + // + // We also know if we are doing an extload, we can't do a scalar load. + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // Basic, single offset DS instructions allow a 16-bit unsigned immediate + // field. + // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have + // an 8-bit dword offset but we don't know the alignment here. + if (!isUInt<16>(AM.BaseOffs)) + return false; + + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. + return true; + + if (AM.Scale == 1 && AM.HasBaseReg) + return true; + + return false; + } + case AMDGPUAS::FLAT_ADDRESS: { + // Flat instructions do not have offsets, and only have the register + // address. + return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); + } + default: + llvm_unreachable("unhandled address space"); + } +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + if (!VT.isSimple() || VT == MVT::Other) + return false; + + // TODO - CI+ supports unaligned memory accesses, but this requires driver + // support. + + // XXX - The only mention I see of this in the ISA manual is for LDS direct + // reads the "byte address and must be dword aligned". Is it also true for the + // normal loads and stores? + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + return Align % 4 == 0; + } + + // Smaller than dword value must be aligned. 
+ // FIXME: This should be allowed on CI+ + if (VT.bitsLT(MVT::i32)) + return false; + + // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the + // byte-address are ignored, thus forcing Dword alignment. + // This applies to private, global, and constant memory. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + +EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: Should account for address space here. + + // The default fallback uses the private pointer size as a guess for a type to + // use. Make sure we switch these to 64-bit accesses. + + if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + return MVT::v4i32; + + if (Size >= 8 && DstAlign >= 4) + return MVT::v2i32; + + // Use the default. + return MVT::Other; +} + +TargetLoweringBase::LegalizeTypeAction +SITargetLowering::getPreferredVectorAction(EVT VT) const { + if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return TypeSplitVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + +bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + return TII->isInlineConstant(Imm); +} + +static EVT toIntegerVT(EVT VT) { + if (VT.isVector()) + return VT.changeVectorElementTypeToInteger(); + return MVT::getIntegerVT(VT.getSizeInBits()); +} + +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + SDLoc SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout *DL = getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, + MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); + SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + + unsigned Align = DL->getABITypeAlignment(Ty); + + if (VT != MemVT && VT.isFloatingPoint()) { + // Do an integer load and convert. + // FIXME: This is mostly because load legalization after type legalization + // doesn't handle FP extloads. + assert(VT.getScalarType() == MVT::f32 && + MemVT.getScalarType() == MVT::f16); + + EVT IVT = toIntegerVT(VT); + EVT MemIVT = toIntegerVT(MemVT); + SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, + IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment + return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); + } + + ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, + VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, + false, // isVolatile + true, // isNonTemporal + true, // isInvariant + Align); // Alignment +} + +SDValue SITargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + + MachineFunction &MF = DAG.getMachineFunction(); + FunctionType *FType = MF.getFunction()->getFunctionType(); + SIMachineFunctionInfo *Info = MF.getInfo(); + + assert(CallConv == CallingConv::C); + + SmallVector Splits; + BitVector Skipped(Ins.size()); + + for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { + const ISD::InputArg &Arg = Ins[i]; + + // First check if it's a PS input addr + if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + !Arg.Flags.isByVal()) { + + assert((PSInputNum <= 15) && "Too many PS inputs!"); + + if (!Arg.Used) { + // We can savely skip PS inputs + Skipped.set(i); + ++PSInputNum; + continue; + } + + Info->PSInputAddr |= 1 << PSInputNum++; + } + + // Second split vertices into their elements + if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eigth. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + + } else if (Info->getShaderType() != ShaderType::COMPUTE) { + Splits.push_back(Arg); + } + } + + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); + + // At least one interpolation mode must be enabled or else the GPU will hang. + if (Info->getShaderType() == ShaderType::PIXEL && + (Info->PSInputAddr & 0x7F) == 0) { + Info->PSInputAddr |= 1; + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + } + + // The pointer to the list of arguments is stored in SGPR0, SGPR1 + // The pointer to the scratch buffer is stored in SGPR2, SGPR3 + if (Info->getShaderType() == ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS()) + Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
+ else + Info->NumUserSGPRs = 4; + + unsigned InputPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrRegLo = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned InputPtrRegHi = + TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); + + unsigned ScratchPtrReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchPtrRegLo = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); + unsigned ScratchPtrRegHi = + TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); + + CCInfo.AllocateReg(InputPtrRegLo); + CCInfo.AllocateReg(InputPtrRegHi); + CCInfo.AllocateReg(ScratchPtrRegLo); + CCInfo.AllocateReg(ScratchPtrRegHi); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); + } + + if (Info->getShaderType() == ShaderType::COMPUTE) { + getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, + Splits); + } + + AnalyzeFormalArguments(CCInfo, Splits); + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + + const ISD::InputArg &Arg = Ins[i]; + if (Skipped[i]) { + InVals.push_back(DAG.getUNDEF(Arg.VT)); + continue; + } + + CCValAssign &VA = ArgLocs[ArgIdx++]; + MVT VT = VA.getLocVT(); + + if (VA.isMemLoc()) { + VT = Ins[i].VT; + EVT MemVT = Splits[i].VT; + const unsigned Offset = 36 + VA.getLocMemOffset(); + // The first 36 bytes of the input buffer contains information about + // thread group and global sizes. + SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), + Offset, Ins[i].Flags.isSExt()); + + const PointerType *ParamTy = + dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + // On SI local pointers are just offsets into LDS, so they are always + // less than 16-bits. On CI and newer they could potentially be + // real pointers, so we can't guarantee their size. 
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, + DAG.getValueType(MVT::i16)); + } + + InVals.push_back(Arg); + Info->ABIArgOffset = Offset + MemVT.getStoreSize(); + continue; + } + assert(VA.isRegLoc() && "Parameter must be in a register!"); + + unsigned Reg = VA.getLocReg(); + + if (VT == MVT::i64) { + // For now assume it is a pointer + Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, + &AMDGPU::SReg_64RegClass); + Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); + InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + continue; + } + + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + + Reg = MF.addLiveIn(Reg, RC); + SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + + if (Arg.VT.isVector()) { + + // Build a vector from the registers + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + SmallVector Regs; + Regs.push_back(Val); + for (unsigned j = 1; j != NumElements; ++j) { + Reg = ArgLocs[ArgIdx++].getLocReg(); + Reg = MF.addLiveIn(Reg, RC); + Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); + } + + // Fill up the missing vector elements + NumElements = Arg.VT.getVectorNumElements() - NumElements; + Regs.append(NumElements, DAG.getUNDEF(VT)); + + InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + continue; + } + + InVals.push_back(Val); + } + + if (Info->getShaderType() != ShaderType::COMPUTE) { + unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef( + AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); + Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + } + return Chain; +} + +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + + MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + switch (MI->getOpcode()) { + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: + return BB; + case AMDGPU::SI_RegisterStorePseudo: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder MIB = + BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), + Reg); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) + MIB.addOperand(MI->getOperand(i)); + + MI->eraseFromParent(); + break; + } + } + return BB; +} + +bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { + // This currently forces unfolding various combinations of fsub into fma with + // free fneg'd operands. As long as we have fast FMA (controlled by + // isFMAFasterThanFMulAndFAdd), we should perform these. + + // When fma is quarter rate, for f64 where add / sub are at best half rate, + // most of these combines appear to be cycle neutral but save on instruction + // count / code size. + return true; +} + +EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { + if (!VT.isVector()) { + return MVT::i1; + } + return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); +} + +MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { + return MVT::i32; +} + +// Answering this is somewhat tricky and depends on the specific device which +// have different rates for fma or all f64 operations. 
+// +// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other +// regardless of which device (although the number of cycles differs between +// devices), so it is always profitable for f64. +// +// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable +// only on full rate devices. Normally, we should prefer selecting v_mad_f32 +// which we can always do even without fused FP ops since it returns the same +// result as the separate operations and since it is always full +// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 +// however does not support denormals, so we do report fma as faster if we have +// a fast fma device and require denormals. +// +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + // This is as fast on some subtargets. However, we always have full rate f32 + // mad available which returns the same result as the separate operations + // which we should prefer over fma. We can't use this if we want to support + // denormals, so only report this in these cases. + return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::LOAD: { + SDValue Result = LowerLOAD(Op, DAG); + assert((!Result.getNode() || + Result.getNode()->getNumValues() == 2) && + "Load should return a value and a chain"); + return Result; + } + + case ISD::FSIN: + case ISD::FCOS: + return LowerTrig(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::GlobalAddress: { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + return LowerGlobalAddress(MFI, Op, DAG); + } + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + } + return SDValue(); +} + +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + + SDNode *Parent = Value.getNode(); + for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); + I != E; ++I) { + + if (I.getUse().get() != Value) + continue; + + if (I->getOpcode() == Opcode) + return *I; + } + return nullptr; +} + +SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + FrameIndexSDNode *FINode = cast(Op); + unsigned FrameIndex = FINode->getIndex(); + + return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); +} + +/// This transforms the control flow intrinsics to get the branch destination as +/// last parameter, also switches branch target with BR if the need arise +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, + SelectionDAG &DAG) const { + + SDLoc DL(BRCOND); + + SDNode *Intr = BRCOND.getOperand(1).getNode(); + SDValue Target = BRCOND.getOperand(2); + SDNode *BR = nullptr; + + if 
(Intr->getOpcode() == ISD::SETCC) { + // As long as we negate the condition everything is fine + SDNode *SetCC = Intr; + assert(SetCC->getConstantOperandVal(1) == 1); + assert(cast(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE); + Intr = SetCC->getOperand(0).getNode(); + + } else { + // Get the target from BR if we don't negate the condition + BR = findUser(BRCOND, ISD::BR); + Target = BR->getOperand(1); + } + + assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + + // Build the result and + ArrayRef Res(Intr->value_begin() + 1, Intr->value_end()); + + // operands of the new intrinsic call + SmallVector Ops; + Ops.push_back(BRCOND.getOperand(0)); + Ops.append(Intr->op_begin() + 1, Intr->op_end()); + Ops.push_back(Target); + + // build the new intrinsic call + SDNode *Result = DAG.getNode( + Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, + DAG.getVTList(Res), Ops).getNode(); + + if (BR) { + // Give the branch instruction our target + SDValue Ops[] = { + BR->getOperand(0), + BRCOND.getOperand(2) + }; + SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); + DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); + BR = NewBR.getNode(); + } + + SDValue Chain = SDValue(Result, Result->getNumValues() - 1); + + // Copy the intrinsic results to registers + for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { + SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); + if (!CopyToReg) + continue; + + Chain = DAG.getCopyToReg( + Chain, DL, + CopyToReg->getOperand(1), + SDValue(Result, i - 1), + SDValue()); + + DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); + } + + // Remove the old intrinsic from the chain + DAG.ReplaceAllUsesOfValueWith( + SDValue(Intr, Intr->getNumValues() - 1), + Intr->getOperand(0)); + + return Chain; +} + +SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + GlobalAddressSDNode *GSD = cast(Op); + + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + SDLoc DL(GSD); + const GlobalValue *GV = GSD->getGlobal(); + MVT PtrVT = getPointerTy(GSD->getAddressSpace()); + + SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); + + SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(0, DL, MVT::i32)); + SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, + DAG.getConstant(1, DL, MVT::i32)); + + SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrLo, GA); + SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), + PtrHi, DAG.getConstant(0, DL, MVT::i32), + SDValue(Lo.getNode(), 1)); + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); +} + +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + SDValue V) const { + // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, + // so we will end up with redundant moves to m0. + // + // We can't use S_MOV_B32, because there is no way to specify m0 as the + // destination register. + // + // We have to use them both. Machine cse will combine all the S_MOV_B32 + // instructions and the register coalescer eliminate the extra copies. 
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); + return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), + SDValue(M0, 0), SDValue()); // Glue + // A Null SDValue creates + // a glue result. +} + +SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const SIRegisterInfo *TRI = + static_cast(Subtarget->getRegisterInfo()); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); + + switch (IntrinsicID) { + case Intrinsic::r600_read_ngroups_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_X, false); + case Intrinsic::r600_read_ngroups_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Y, false); + case Intrinsic::r600_read_ngroups_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::NGROUPS_Z, false); + case Intrinsic::r600_read_global_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + case Intrinsic::r600_read_global_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + case Intrinsic::r600_read_global_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + case Intrinsic::r600_read_local_size_x: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_X, false); + case Intrinsic::r600_read_local_size_y: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + case Intrinsic::r600_read_local_size_z: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + SI::KernelInputOffsets::LOCAL_SIZE_Z, false); + + case Intrinsic::AMDGPU_read_workdim: + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), + MF.getInfo()->ABIArgOffset, + false); + + case Intrinsic::r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + case Intrinsic::r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + case Intrinsic::r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + case Intrinsic::r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + case Intrinsic::r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + case Intrinsic::r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + case AMDGPUIntrinsic::SI_load_const: { + SDValue Ops[] = { + Op.getOperand(1), + Op.getOperand(2) + }; + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, + Op->getVTList(), Ops, VT, MMO); + } + case AMDGPUIntrinsic::SI_sample: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); + case AMDGPUIntrinsic::SI_sampleb: + return 
LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); + case AMDGPUIntrinsic::SI_sampled: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); + case AMDGPUIntrinsic::SI_samplel: + return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); + case AMDGPUIntrinsic::SI_vs_load_input: + return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); + + case AMDGPUIntrinsic::AMDGPU_fract: + case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. + return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), + DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); + case AMDGPUIntrinsic::SI_fs_constant: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(1), Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_fs_interp: { + SDValue IJ = Op.getOperand(4); + SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(0, DL, MVT::i32)); + SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, + DAG.getConstant(1, DL, MVT::i32)); + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + SDValue Glue = M0.getValue(1); + SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, + DAG.getVTList(MVT::f32, MVT::Glue), + I, Op.getOperand(1), Op.getOperand(2), Glue); + Glue = SDValue(P1.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, + Op.getOperand(1), Op.getOperand(2), Glue); + } + default: + return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } +} + +SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); + + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_sendmsg: { + Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); + SDValue Glue = Chain.getValue(1); + return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, + Op.getOperand(2), Glue); + } + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + + EVT VT = Op.getOperand(3).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: + return SDValue(); + } +} + +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + LoadSDNode *Load = cast(Op); + + if (Op.getValueType().isVector()) { + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned NumElements = Op.getValueType().getVectorNumElements(); + assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { + default: break; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + // v4 loads are supported for private and global memory. 
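+      // Wider global/private vectors fall through and are scalarized below,
+      // as are all LDS vector loads that reach this point.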
+ if (NumElements <= 4) + break; + // fall-through + case AMDGPUAS::LOCAL_ADDRESS: + return ScalarizeVectorLoad(Op, DAG); + } + } + + return AMDGPUTargetLowering::LowerLOAD(Op, DAG); +} + +SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, + const SDValue &Op, + SelectionDAG &DAG) const { + return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4)); +} + +SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i64) + return SDValue(); + + SDLoc DL(Op); + SDValue Cond = Op.getOperand(0); + + SDValue Zero = DAG.getConstant(0, DL, MVT::i32); + SDValue One = DAG.getConstant(1, DL, MVT::i32); + + SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); + SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); + + SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); + SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); + + SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); + + SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); + SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); + + SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); + + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); +} + +// Catch division cases where we can use shortcuts with rcp and rsq +// instructions. +SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + EVT VT = Op.getValueType(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + + if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { + if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && + CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + + // 1.0 / sqrt(x) -> rsq(x) + // + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + } + } + + if (Unsafe) { + // Turn into multiply by the reciprocal. + // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + } + + return SDValue(); +} + +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + SDValue FastLowered = LowerFastFDIV(Op, DAG); + if (FastLowered.getNode()) + return FastLowered; + + // This uses v_rcp_f32 which does not handle denormals. Let this hit a + // selection error for now rather than do something incorrect. 
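+  // The expansion below also pre-scales very large denominators: if |RHS|
+  // exceeds 0x6f800000 (2^96), RHS is first multiplied by 0x2f800000 (2^-32)
+  // and the quotient is multiplied by the same 2^-32 afterwards, which keeps
+  // the intermediate reciprocal out of the denormal range that v_rcp_f32
+  // would otherwise flush.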
+ if (Subtarget->hasFP32Denormals()) + return SDValue(); + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} + +SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getTarget().Options.UnsafeFPMath) + return LowerFastFDIV(Op, DAG); + + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); + + SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); + + SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); + + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); + + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); + + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); + + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); + + SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); + + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); + + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, + NegDivScale0, Mul, DivScale1); + + SDValue Scale; + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. + + const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); + + // Figure out if the scale to use for div_fmas. 
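+    // Reconstruct that condition by hand: compare the high dword of each
+    // div_scale result with the high dword of the corresponding original
+    // operand to detect whether it was rescaled, then XOR the two tests to
+    // form the scale flag that div_fmas expects.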
+ SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); + SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); + SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); + + SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); + SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); + + SDValue Scale0Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); + SDValue Scale1Hi + = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); + + SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); + SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); + Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); + } else { + Scale = DivScale1.getValue(1); + } + + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, + Fma4, Fma3, Mul, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); +} + +SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFDIV32(Op, DAG); + + if (VT == MVT::f64) + return LowerFDIV64(Op, DAG); + + llvm_unreachable("Unexpected type for fdiv"); +} + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + StoreSDNode *Store = cast(Op); + EVT VT = Store->getMemoryVT(); + + // These stores are legal. + if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (VT.isVector() && VT.getVectorNumElements() > 4) + return ScalarizeVectorStore(Op, DAG); + return SDValue(); + } + + SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); + if (Ret.getNode()) + return Ret; + + if (VT.isVector() && VT.getVectorNumElements() >= 8) + return ScalarizeVectorStore(Op, DAG); + + if (VT == MVT::i1) + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + + return SDValue(); +} + +SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Arg = Op.getOperand(0); + SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Arg, + DAG.getConstantFP(0.5/M_PI, DL, + VT))); + + switch (Op.getOpcode()) { + case ISD::FCOS: + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); + case ISD::FSIN: + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); + default: + llvm_unreachable("Wrong trig opcode"); + } +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + EVT ScalarVT = VT.getScalarType(); + if (ScalarVT != MVT::f32) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + // TODO: We could try to match extracting the higher bytes, which would be + // easier if i8 vectors weren't promoted to i32 vectors, particularly after + // types are legalized. v4i8 -> v4f32 is probably the only case to worry + // about in practice. 
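+  // Scalar case: if the 24 high bits of the i32 source are known to be zero,
+  // the uint_to_fp only ever sees a byte value and can be replaced with a
+  // single CVT_F32_UBYTE0.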
+ if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + DCI.AddToWorklist(Cvt.getNode()); + return Cvt; + } + } + + // We are primarily trying to catch operations on illegal vector types + // before they are expanded. + // For scalars, we can use the more flexible method of checking masked bits + // after legalization. + if (!DCI.isBeforeLegalize() || + !SrcVT.isVector() || + SrcVT.getVectorElementType() != MVT::i8) { + return SDValue(); + } + + assert(DCI.isBeforeLegalize() && "Unexpected legal type"); + + // Weird sized vectors are a pain to handle, but we know 3 is really the same + // size as 4. + unsigned NElts = SrcVT.getVectorNumElements(); + if (!SrcVT.isSimple() && NElts != 3) + return SDValue(); + + // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to + // prevent a mess from expanding to v4i32 and repacking. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); + EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); + LoadSDNode *Load = cast(Src); + + unsigned AS = Load->getAddressSpace(); + unsigned Align = Load->getAlignment(); + Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + + // Don't try to replace the load if we have to expand it due to alignment + // problems. Otherwise we will end up scalarizing the load, and trying to + // repack into the vector for no real reason. + if (Align < ABIAlignment && + !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { + return SDValue(); + } + + SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, + Load->getChain(), + Load->getBasePtr(), + LoadVT, + Load->getMemOperand()); + + // Make sure successors of the original load stay after it by updating + // them to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); + + SmallVector Elts; + if (RegVT.isVector()) + DAG.ExtractVectorElements(NewLoad, Elts); + else + Elts.push_back(NewLoad); + + SmallVector Ops; + + unsigned EltIdx = 0; + for (SDValue Elt : Elts) { + unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); + for (unsigned I = 0; I < ComponentsInElt; ++I) { + unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; + SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); + DCI.AddToWorklist(Cvt.getNode()); + Ops.push_back(Cvt); + } + + ++EltIdx; + } + + assert(Ops.size() == NElts); + + return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); + } + + return SDValue(); +} + +/// \brief Return true if the given offset Size in bytes can be folded into +/// the immediate offsets of a memory instruction for the given address space. +static bool canFoldOffset(unsigned OffsetSize, unsigned AS, + const AMDGPUSubtarget &STI) { + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: { + // MUBUF instructions a 12-bit offset in bytes. + return isUInt<12>(OffsetSize); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // SMRD instructions have an 8-bit offset in dwords on SI and + // a 20-bit offset in bytes on VI. 
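+    // For example, a 1020 byte offset is encodable on both (dword offset 255
+    // on SI, byte offset 1020 on VI), while 1024 bytes already exceeds the
+    // 8-bit dword field on SI.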
+ if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return isUInt<20>(OffsetSize); + else + return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); + } + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: { + // The single offset versions have a 16-bit offset in bytes. + return isUInt<16>(OffsetSize); + } + case AMDGPUAS::PRIVATE_ADDRESS: + // Indirect register addressing does not use any offsets. + default: + return 0; + } +} + +// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) + +// This is a variant of +// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), +// +// The normal DAG combiner will do this, but only if the add has one use since +// that would increase the number of instructions. +// +// This prevents us from seeing a constant offset that can be folded into a +// memory instruction's addressing mode. If we know the resulting add offset of +// a pointer can be folded into an addressing offset, we can replace the pointer +// operand with the add of new constant offset. This eliminates one of the uses, +// and may allow the remaining use to also be simplified. +// +SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, + unsigned AddrSpace, + DAGCombinerInfo &DCI) const { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N0.getOpcode() != ISD::ADD) + return SDValue(); + + const ConstantSDNode *CN1 = dyn_cast(N1); + if (!CN1) + return SDValue(); + + const ConstantSDNode *CAdd = dyn_cast(N0.getOperand(1)); + if (!CAdd) + return SDValue(); + + // If the resulting offset is too large, we can't fold it into the addressing + // mode offset. + APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + + SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); + SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); + + return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); +} + +SDValue SITargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + + // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> + // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + if (LHS.getOpcode() == ISD::SETCC && + RHS.getOpcode() == ISD::SETCC) { + ISD::CondCode LCC = cast(LHS.getOperand(2))->get(); + ISD::CondCode RCC = cast(RHS.getOperand(2))->get(); + + SDValue X = LHS.getOperand(0); + SDValue Y = RHS.getOperand(0); + if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) + return SDValue(); + + if (LCC == ISD::SETO) { + if (X != LHS.getOperand(1)) + return SDValue(); + + if (RCC == ISD::SETUNE) { + const ConstantFPSDNode *C1 = dyn_cast(RHS.getOperand(1)); + if (!C1 || !C1->isInfinity() || C1->isNegative()) + return SDValue(); + + const uint32_t Mask = SIInstrFlags::N_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::P_SUBNORMAL | + SIInstrFlags::P_NORMAL; + + static_assert(((~(SIInstrFlags::S_NAN | + SIInstrFlags::Q_NAN | + SIInstrFlags::N_INFINITY | + SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, + "mask not equal"); + + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + X, DAG.getConstant(Mask, DL, MVT::i32)); + } + } + } + + return SDValue(); +} + +SDValue SITargetLowering::performOrCombine(SDNode *N, + 
DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) + if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.getOpcode() == AMDGPUISD::FP_CLASS) { + SDValue Src = LHS.getOperand(0); + if (Src != RHS.getOperand(0)) + return SDValue(); + + const ConstantSDNode *CLHS = dyn_cast(LHS.getOperand(1)); + const ConstantSDNode *CRHS = dyn_cast(RHS.getOperand(1)); + if (!CLHS || !CRHS) + return SDValue(); + + // Only 10 bits are used. + static const uint32_t MaxMask = 0x3ff; + + uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, + Src, DAG.getConstant(NewMask, DL, MVT::i32)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performClassCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue Mask = N->getOperand(1); + + // fp_class x, 0 -> false + if (const ConstantSDNode *CMask = dyn_cast(Mask)) { + if (CMask->isNullValue()) + return DAG.getConstant(0, SDLoc(N), MVT::i1); + } + + return SDValue(); +} + +static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return AMDGPUISD::FMAX3; + case ISD::SMAX: + return AMDGPUISD::SMAX3; + case ISD::UMAX: + return AMDGPUISD::UMAX3; + case ISD::FMINNUM: + return AMDGPUISD::FMIN3; + case ISD::SMIN: + return AMDGPUISD::SMIN3; + case ISD::UMIN: + return AMDGPUISD::UMIN3; + default: + llvm_unreachable("Not a min/max opcode"); + } +} + +SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Only do this if the inner op has one use since this will just increases + // register pressure for no benefit. 
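+  // (If the inner min/max had other users it would still have to be emitted,
+  // so forming a min3/max3 would add an instruction rather than replace one.)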
+ + // max(max(a, b), c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // max(a, max(b, c)) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } + + return SDValue(); +} + +SDValue SITargetLowering::performSetCCCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + EVT VT = LHS.getValueType(); + + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + + // Match isinf pattern + // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) + ISD::CondCode CC = cast(N->getOperand(2))->get(); + if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + const ConstantFPSDNode *CRHS = dyn_cast(RHS); + if (!CRHS) + return SDValue(); + + const APFloat &APF = CRHS->getValueAPF(); + if (APF.isInfinity() && !APF.isNegative()) { + unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(Mask, SL, MVT::i32)); + } + } + + return SDValue(); +} + +SDValue SITargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch (N->getOpcode()) { + default: + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); + case ISD::SETCC: + return performSetCCCombine(N, DCI); + case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMINNUM: + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: { + if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + N->getValueType(0) != MVT::f64 && + getTargetMachine().getOptLevel() > CodeGenOpt::None) + return performMin3Max3Combine(N, DCI); + break; + } + + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: { + unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; + + SDValue Src = N->getOperand(0); + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Src, Demanded) || + TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + } + + break; + } + + case ISD::UINT_TO_FP: { + return performUCharToFloatCombine(N, DCI); + + case ISD::FADD: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + if (VT != MVT::f32) + break; + + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (Subtarget->hasFP32Denormals()) + break; + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // These should really be instruction patterns, but writing patterns with + // source modiifiers is a pain. 
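+  // Note that 2.0 and -2.0 are inline constants, so each rewrite below folds
+  // the two-add sequence into a single full rate v_mad_f32 with no extra
+  // literal.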
+ + // fadd (fadd (a, a), b) -> mad 2.0, a, b + if (LHS.getOpcode() == ISD::FADD) { + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); + } + } + + // fadd (b, fadd (a, a)) -> mad 2.0, a, b + if (RHS.getOpcode() == ISD::FADD) { + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); + } + } + + return SDValue(); + } + case ISD::FSUB: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + EVT VT = N->getValueType(0); + + // Try to get the fneg to fold into the source modifier. This undoes generic + // DAG combines and folds them into the mad. + // + // Only do this if we are not trying to support denormals. v_mad_f32 does + // not support denormals ever. + if (VT == MVT::f32 && + !Subtarget->hasFP32Denormals()) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS.getOpcode() == ISD::FADD) { + // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) + + SDValue A = LHS.getOperand(0); + if (A == LHS.getOperand(1)) { + const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); + SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); + + return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); + } + } + + if (RHS.getOpcode() == ISD::FADD) { + // (fsub c, (fadd a, a)) -> mad -2.0, a, c + + SDValue A = RHS.getOperand(0); + if (A == RHS.getOperand(1)) { + const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); + return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); + } + } + + return SDValue(); + } + + break; + } + } + case ISD::LOAD: + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: + case ISD::ATOMIC_CMP_SWAP: + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + case ISD::ATOMIC_SWAP: + case ISD::ATOMIC_LOAD_ADD: + case ISD::ATOMIC_LOAD_SUB: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_NAND: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_UMIN: + case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + if (DCI.isBeforeLegalize()) + break; + + MemSDNode *MemNode = cast(N); + SDValue Ptr = MemNode->getBasePtr(); + + // TODO: We could also do this for multiplies. + unsigned AS = MemNode->getAddressSpace(); + if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (NewPtr) { + SmallVector NewOps(MemNode->op_begin(), MemNode->op_end()); + + NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; + return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); + } + } + break; + } + case ISD::AND: + return performAndCombine(N, DCI); + case ISD::OR: + return performOrCombine(N, DCI); + case AMDGPUISD::FP_CLASS: + return performClassCombine(N, DCI); + } + return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); +} + +/// \brief Analyze the possible immediate value Op +/// +/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate +/// and the immediate value if it's a literal immediate +int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { + + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + if (const ConstantSDNode *Node = dyn_cast(N)) { + if (TII->isInlineConstant(Node->getAPIntValue())) + return 0; + + uint64_t Val = Node->getZExtValue(); + return isUInt<32>(Val) ? 
Val : -1; + } + + if (const ConstantFPSDNode *Node = dyn_cast(N)) { + if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) + return 0; + + if (Node->getValueType(0) == MVT::f32) + return FloatToBits(Node->getValueAPF().convertToFloat()); + + return -1; + } + + return -1; +} + +/// \brief Helper function for adjustWritemask +static unsigned SubIdx2Lane(unsigned Idx) { + switch (Idx) { + default: return 0; + case AMDGPU::sub0: return 0; + case AMDGPU::sub1: return 1; + case AMDGPU::sub2: return 2; + case AMDGPU::sub3: return 3; + } +} + +/// \brief Adjust the writemask of MIMG instructions +void SITargetLowering::adjustWritemask(MachineSDNode *&Node, + SelectionDAG &DAG) const { + SDNode *Users[4] = { }; + unsigned Lane = 0; + unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned NewDmask = 0; + + // Try to figure out the used register components + for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); + I != E; ++I) { + + // Abort if we can't understand the usage + if (!I->isMachineOpcode() || + I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + return; + + // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. + // Note that subregs are packed, i.e. Lane==0 is the first bit set + // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit + // set, etc. + Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + + // Set which texture component corresponds to the lane. + unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { + assert(Dmask); + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } + + // Abort if we have more than one user per component + if (Users[Lane]) + return; + + Users[Lane] = *I; + NewDmask |= 1 << Comp; + } + + // Abort if there's no change + if (NewDmask == OldDmask) + return; + + // Adjust the writemask in the node + std::vector Ops; + Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); + Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); + + // If we only got one lane, replace it with a copy + // (if NewDmask has only one bit set...) + if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { + SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), + MVT::i32); + SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + SDLoc(), Users[Lane]->getValueType(0), + SDValue(Node, 0), RC); + DAG.ReplaceAllUsesWith(Users[Lane], Copy); + return; + } + + // Update the users of the node with the new indices + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + + SDNode *User = Users[i]; + if (!User) + continue; + + SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); + DAG.UpdateNodeOperands(User, User->getOperand(0), Op); + + switch (Idx) { + default: break; + case AMDGPU::sub0: Idx = AMDGPU::sub1; break; + case AMDGPU::sub1: Idx = AMDGPU::sub2; break; + case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + } + } +} + +/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// with frame index operands. +/// LLVM assumes that inputs are to these instructions are registers. 
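+/// Any frame index operand is therefore materialized into an SGPR with
+/// S_MOV_B32 before the operands are rewritten.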
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, + SelectionDAG &DAG) const { + + SmallVector Ops; + for (unsigned i = 0; i < Node->getNumOperands(); ++i) { + if (!isa(Node->getOperand(i))) { + Ops.push_back(Node->getOperand(i)); + continue; + } + + SDLoc DL(Node); + Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, + Node->getOperand(i).getValueType(), + Node->getOperand(i)), 0)); + } + + DAG.UpdateNodeOperands(Node, Ops); +} + +/// \brief Fold the instructions after selecting them. +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + if (TII->isMIMG(Node->getMachineOpcode())) + adjustWritemask(Node, DAG); + + if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || + Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + legalizeTargetIndependentNode(Node, DAG); + return Node; + } + return Node; +} + +/// \brief Assign the register class depending on the number of +/// bits set in the writemask +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + TII->legalizeOperands(MI); + + if (TII->isMIMG(MI->getOpcode())) { + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VGPR_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); + MI->setDesc(TII->get(NewOpcode)); + MRI.setRegClass(VReg, RC); + return; + } + + // Replace unused atomics with the no return version. + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + if (NoRetAtomicOp != -1) { + if (!Node->hasAnyUseOfValue(0)) { + MI->setDesc(TII->get(NoRetAtomicOp)); + MI->RemoveOperand(0); + } + + return; + } +} + +static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { + SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); +} + +MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); +#if 1 + // XXX - Workaround for moveToVALU not handling different register class + // inserts for REG_SEQUENCE. + + // Build the half of the subregister with the constants. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; + + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); + + // Combine the constants and the pointer. 
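+  // The resulting 128-bit descriptor holds the 64-bit pointer in dwords 0-1,
+  // zero in dword 2 and the high half of the default rsrc data format in
+  // dword 3.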
+ const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); +#else + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + +#endif +} + +/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// The TID (Thread ID) is multipled by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to the +/// resource ponter. +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const { + SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); + SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); + if (RsrcDword1) { + PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, + DAG.getConstant(RsrcDword1, DL, MVT::i32)), + 0); + } + + SDValue DataLo = buildSMovImm32(DAG, DL, + RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); + SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); + + const SDValue Ops[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + PtrLo, + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + PtrHi, + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), + DataLo, + DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), + DataHi, + DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) + }; + + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); +} + +MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const { + const SIInstrInfo *TII = + static_cast(Subtarget->getInstrInfo()); + uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + return buildRSRC(DAG, DL, Ptr, 0, Rsrc); +} + +SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), + cast(VReg)->getReg(), VT); +} + +//===----------------------------------------------------------------------===// +// SI Inline Assembly Support +//===----------------------------------------------------------------------===// + +std::pair +SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + const std::string &Constraint, + MVT VT) const { + if (Constraint == "r") { + switch(VT.SimpleTy) { + default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); + case MVT::i64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case MVT::i32: + return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + } + } + + if (Constraint.size() > 1) { + const TargetRegisterClass *RC = nullptr; + if (Constraint[1] == 'v') { + RC = &AMDGPU::VGPR_32RegClass; + } else if (Constraint[1] == 's') { + RC = &AMDGPU::SGPR_32RegClass; + } + + if (RC) { + unsigned Idx = 
std::atoi(Constraint.substr(2).c_str()); + if (Idx < RC->getNumRegs()) + return std::make_pair(RC->getRegister(Idx), RC); + } + } + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h new file mode 100644 index 00000000000..a956b013bdb --- /dev/null +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -0,0 +1,125 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI DAG Lowering interface definition +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H + +#include "AMDGPUISelLowering.h" +#include "SIInstrInfo.h" + +namespace llvm { + +class SITargetLowering : public AMDGPUTargetLowering { + SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, + SDValue Chain, unsigned Offset, bool Signed) const; + SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, + SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; + + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + + SDValue performUCharToFloatCombine(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, + unsigned AS, + DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; + +public: + SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); + + bool isShuffleMaskLegal(const SmallVectorImpl &/*Mask*/, + EVT /*VT*/) const override; + + bool isLegalAddressingMode(const AddrMode &AM, + Type *Ty, unsigned AS) const override; + + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const override; + + TargetLoweringBase::LegalizeTypeAction + getPreferredVectorAction(EVT VT) const override; + + bool 
shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const override; + bool enableAggressiveFMAFusion(EVT VT) const override; + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + MVT getScalarShiftAmountTy(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const override; + + int32_t analyzeImmediate(const SDNode *N) const; + SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const override; + void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; + + MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; + MachineSDNode *buildRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr, + uint32_t RsrcDword1, + uint64_t RsrcDword2And3) const; + MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, + SDLoc DL, + SDValue Ptr) const; + + std::pair getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, + const std::string &Constraint, MVT VT) const override; + SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp new file mode 100644 index 00000000000..90a37f17468 --- /dev/null +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -0,0 +1,480 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously. 
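+///
+/// The pass models the three hardware counters (vm_cnt, exp_cnt and lgkm_cnt,
+/// see the Counters union below) for every register and inserts a wait before
+/// any use or redefinition that an outstanding operation still covers.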
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +/// \brief One variable for each of the hardware counters +typedef union { + struct { + unsigned VM; + unsigned EXP; + unsigned LGKM; + } Named; + unsigned Array[3]; + +} Counters; + +typedef enum { + OTHER, + SMEM, + VMEM +} InstType; + +typedef Counters RegCounters[512]; +typedef std::pair RegInterval; + +class SIInsertWaits : public MachineFunctionPass { + +private: + static char ID; + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + + /// \brief Constant hardware limits + static const Counters WaitCounts; + + /// \brief Constant zero value + static const Counters ZeroCounts; + + /// \brief Counter values we have already waited on. + Counters WaitedOn; + + /// \brief Counter values for last instruction issued. + Counters LastIssued; + + /// \brief Registers used by async instructions. + RegCounters UsedRegs; + + /// \brief Registers defined by async instructions. + RegCounters DefinedRegs; + + /// \brief Different export instruction types seen since last wait. + unsigned ExpInstrTypesSeen; + + /// \brief Type of the last opcode. + InstType LastOpcodeType; + + bool LastInstWritesM0; + + /// \brief Get increment/decrement amount for this instruction. + Counters getHwCounts(MachineInstr &MI); + + /// \brief Is operand relevant for async execution? + bool isOpRelevant(MachineOperand &Op); + + /// \brief Get register interval an operand affects. + RegInterval getRegInterval(MachineOperand &Op); + + /// \brief Handle instructions async components + void pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + + /// \brief Insert the actual wait instruction + bool insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Counts); + + /// \brief Do we need def2def checks? + bool unorderedDefines(MachineInstr &MI); + + /// \brief Resolve all operand dependencies to counter requirements + Counters handleOperands(MachineInstr &MI); + + /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 
+ void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + +public: + SIInsertWaits(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(nullptr), + TRI(nullptr), + ExpInstrTypesSeen(0) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI insert wait instructions"; + } + +}; + +} // End anonymous namespace + +char SIInsertWaits::ID = 0; + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; + +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { + return new SIInsertWaits(tm); +} + +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { + + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + Counters Result; + + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); + + // Only consider stores or EXP for EXP_CNT + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && + (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); + + // LGKM may uses larger values + if (TSFlags & SIInstrFlags::LGKM_CNT) { + + if (TII->isSMRD(MI.getOpcode())) { + + MachineOperand &Op = MI.getOperand(0); + assert(Op.isReg() && "First LGKM operand must be a register!"); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + + } else { + // DS + Result.Named.LGKM = 1; + } + + } else { + Result.Named.LGKM = 0; + } + + return Result; +} + +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { + + // Constants are always irrelevant + if (!Op.isReg()) + return false; + + // Defines are always relevant + if (Op.isDef()) + return true; + + // For exports all registers are relevant + MachineInstr &MI = *Op.getParent(); + if (MI.getOpcode() == AMDGPU::EXP) + return true; + + // For stores the stored value is also relevant + if (!MI.getDesc().mayStore()) + return false; + + // Check if this operand is the value being stored. + // Special case for DS instructions, since the address + // operand comes before the value operand and it may have + // multiple data operands. + + if (TII->isDS(MI.getOpcode())) { + MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); + if (Data && Op.isIdenticalTo(*Data)) + return true; + + MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + if (Data0 && Op.isIdenticalTo(*Data0)) + return true; + + MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data1 && Op.isIdenticalTo(*Data1)) + return true; + + return false; + } + + // NOTE: This assumes that the value operand is before the + // address operand, and that there is only one value operand. 
+ for (MachineInstr::mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); I != E; ++I) { + + if (I->isReg() && I->isUse()) + return Op.isIdenticalTo(*I); + } + + return false; +} + +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { + + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + return std::make_pair(0, 0); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); + + assert(Size >= 4); + + RegInterval Result; + Result.first = TRI->getEncodingValue(Reg); + Result.second = Result.first + Size / 4; + + return Result; +} + +void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + // Get the hardware counter increments and sum them up + Counters Increment = getHwCounts(*I); + unsigned Sum = 0; + + for (unsigned i = 0; i < 3; ++i) { + LastIssued.Array[i] += Increment.Array[i]; + Sum += Increment.Array[i]; + } + + // If we don't increase anything then that's it + if (Sum == 0) { + LastOpcodeType = OTHER; + return; + } + + if (MBB.getParent()->getSubtarget().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // or SMEM clause, respectively. + // + // The temporary workaround is to break the clauses with S_NOP. + // + // The proper solution would be to allocate registers such that all source + // and destination registers don't overlap, e.g. this is illegal: + // r0 = load r2 + // r2 = load r0 + if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + (LastOpcodeType == VMEM && Increment.Named.VM)) { + // Insert a NOP to break the clause. + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + LastInstWritesM0 = false; + } + + if (TII->isSMRD(I->getOpcode())) + LastOpcodeType = SMEM; + else if (Increment.Named.VM) + LastOpcodeType = VMEM; + } + + // Remember which export instructions we have seen + if (Increment.Named.EXP) { + ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; + } + + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + + MachineOperand &Op = I->getOperand(i); + if (!isOpRelevant(Op)) + continue; + + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + // Remember which registers we define + if (Op.isDef()) + DefinedRegs[j] = LastIssued; + + // and which one we are using + if (Op.isUse()) + UsedRegs[j] = LastIssued; + } + } +} + +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Required) { + + // End of program? No need to wait on anything + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + return false; + + // Figure out if the async instructions execute in order + bool Ordered[3]; + + // VM_CNT is always ordered + Ordered[0] = true; + + // EXP_CNT is unordered if we have both EXP & VM-writes + Ordered[1] = ExpInstrTypesSeen == 3; + + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS + Ordered[2] = false; + + // The values we are going to put into the S_WAITCNT instruction + Counters Counts = WaitCounts; + + // Do we really need to wait? + bool NeedWait = false; + + for (unsigned i = 0; i < 3; ++i) { + + if (Required.Array[i] <= WaitedOn.Array[i]) + continue; + + NeedWait = true; + + if (Ordered[i]) { + unsigned Value = LastIssued.Array[i] - Required.Array[i]; + + // Adjust the value to the real hardware possibilities. 
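// For an ordered counter the target is "how many younger operations may still
// be outstanding once the required one has completed". A standalone sketch of
// the computation on the next line (hypothetical helper, same math):
//
//   #include <algorithm>
//   unsigned waitTarget(unsigned LastIssued, unsigned Required, unsigned HwMax) {
//     return std::min(LastIssued - Required, HwMax);
//   }
//   // e.g. 5 loads issued and the 3rd one's result is needed:
//   // waitTarget(5, 3, 15) == 2, i.e. "s_waitcnt vmcnt(2)".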
+ Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + + } else + Counts.Array[i] = 0; + + // Remember on what we have waited on. + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + } + + if (!NeedWait) + return false; + + // Reset EXP_CNT instruction types + if (Counts.Named.EXP == 0) + ExpInstrTypesSeen = 0; + + // Build the wait instruction + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm((Counts.Named.VM & 0xF) | + ((Counts.Named.EXP & 0x7) << 4) | + ((Counts.Named.LGKM & 0x7) << 8)); + + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + return true; +} + +/// \brief helper function for handleOperands +static void increaseCounters(Counters &Dst, const Counters &Src) { + + for (unsigned i = 0; i < 3; ++i) + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); +} + +Counters SIInsertWaits::handleOperands(MachineInstr &MI) { + + Counters Result = ZeroCounts; + + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (MI.getOpcode() == AMDGPU::S_SENDMSG) + return LastIssued; + + // For each register affected by this + // instruction increase the result sequence + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + if (Op.isDef()) { + increaseCounters(Result, UsedRegs[j]); + increaseCounters(Result, DefinedRegs[j]); + } + + if (Op.isUse()) + increaseCounters(Result, DefinedRegs[j]); + } + } + + return Result; +} + +void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + if (MBB.getParent()->getSubtarget().getGeneration() < + AMDGPUSubtarget::VOLCANIC_ISLANDS) + return; + + // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. + if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); + LastInstWritesM0 = false; + return; + } + + // Set whether this instruction sets M0 + LastInstWritesM0 = false; + + unsigned NumOperands = I->getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + const MachineOperand &Op = I->getOperand(i); + + if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) + LastInstWritesM0 = true; + } +} + +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. +bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { + bool Changes = false; + + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + + MRI = &MF.getRegInfo(); + + WaitedOn = ZeroCounts; + LastIssued = ZeroCounts; + LastOpcodeType = OTHER; + LastInstWritesM0 = false; + + memset(&UsedRegs, 0, sizeof(UsedRegs)); + memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + + // Wait for everything before a barrier. 
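// For reference, the S_WAITCNT immediate built by insertWait() above packs
// the three counters into one word (vmcnt in bits 3:0, expcnt in bits 6:4,
// lgkmcnt in bits 10:8). A standalone sketch of that packing (hypothetical
// helper, mirroring the shifts used above):
//
//   unsigned packWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
//     return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
//   }
//   // packWaitcnt(0, 7, 7)  == 0x770 -> wait only for vector memory
//   // packWaitcnt(15, 7, 7) == 0x77F -> the "no wait" maxima in WaitCounts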
+ if (I->getOpcode() == AMDGPU::S_BARRIER) + Changes |= insertWait(MBB, I, LastIssued); + else + Changes |= insertWait(MBB, I, handleOperands(*I)); + + pushInstruction(MBB, I); + handleSendMsg(MBB, I); + } + + // Wait for everything at the end of the MBB + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + } + + return Changes; +} diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td new file mode 100644 index 00000000000..211666a9bdb --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -0,0 +1,673 @@ +//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// SI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class InstSI pattern> : + AMDGPUInst, PredicateControl { + + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; + + field bits<1> SALU = 0; + field bits<1> VALU = 0; + + field bits<1> SOP1 = 0; + field bits<1> SOP2 = 0; + field bits<1> SOPC = 0; + field bits<1> SOPK = 0; + field bits<1> SOPP = 0; + + field bits<1> VOP1 = 0; + field bits<1> VOP2 = 0; + field bits<1> VOP3 = 0; + field bits<1> VOPC = 0; + + field bits<1> MUBUF = 0; + field bits<1> MTBUF = 0; + field bits<1> SMRD = 0; + field bits<1> DS = 0; + field bits<1> MIMG = 0; + field bits<1> FLAT = 0; + field bits<1> WQM = 0; + field bits<1> VGPRSpill = 0; + + // These need to be kept in sync with the enum in SIInstrFlags. + let TSFlags{0} = VM_CNT; + let TSFlags{1} = EXP_CNT; + let TSFlags{2} = LGKM_CNT; + + let TSFlags{3} = SALU; + let TSFlags{4} = VALU; + + let TSFlags{5} = SOP1; + let TSFlags{6} = SOP2; + let TSFlags{7} = SOPC; + let TSFlags{8} = SOPK; + let TSFlags{9} = SOPP; + + let TSFlags{10} = VOP1; + let TSFlags{11} = VOP2; + let TSFlags{12} = VOP3; + let TSFlags{13} = VOPC; + + let TSFlags{14} = MUBUF; + let TSFlags{15} = MTBUF; + let TSFlags{16} = SMRD; + let TSFlags{17} = DS; + let TSFlags{18} = MIMG; + let TSFlags{19} = FLAT; + let TSFlags{20} = WQM; + let TSFlags{21} = VGPRSpill; + + // Most instructions require adjustments after selection to satisfy + // operand requirements. + let hasPostISelHook = 1; + let SchedRW = [Write32Bit]; +} + +class Enc32 { + field bits<32> Inst; + int Size = 4; +} + +class Enc64 { + field bits<64> Inst; + int Size = 8; +} + +class VOPDstOperand : RegisterOperand ; +def VOPDstVCC : VOPDstOperand ; + +let Uses = [EXEC] in { + +class VOPAnyCommon pattern> : + InstSI { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let VALU = 1; +} + +class VOPCCommon pattern> : + VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + + let DisableEncoding = "$dst"; + let VOPC = 1; + let Size = 4; +} + +class VOP1Common pattern> : + VOPAnyCommon { + + let VOP1 = 1; + let Size = 4; +} + +class VOP2Common pattern> : + VOPAnyCommon { + + let VOP2 = 1; + let Size = 4; +} + +class VOP3Common pattern> : + VOPAnyCommon { + + // Using complex patterns gives VOP3 patterns a very high complexity rating, + // but standalone patterns are almost always prefered, so we need to adjust the + // priority lower. The goal is to use a high number to reduce complexity to + // zero (or less than zero). 
+ let AddedComplexity = -1000; + + let VOP3 = 1; + let VALU = 1; + + let AsmMatchConverter = "cvtVOP3"; + let isCodeGenOnly = 0; + + int Size = 8; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Scalar operations +//===----------------------------------------------------------------------===// + +class SOP1e op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = op; + let Inst{22-16} = sdst; + let Inst{31-23} = 0x17d; //encoding; +} + +class SOP2e op> : Enc32 { + bits<7> sdst; + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = sdst; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding +} + +class SOPCe op> : Enc32 { + bits<8> ssrc0; + bits<8> ssrc1; + + let Inst{7-0} = ssrc0; + let Inst{15-8} = ssrc1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; +} + +class SOPKe op> : Enc32 { + bits <7> sdst; + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding +} + +class SOPK64e op> : Enc64 { + bits <7> sdst = 0; + bits <16> simm16; + bits <32> imm; + + let Inst{15-0} = simm16; + let Inst{22-16} = sdst; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; + + let Inst{63-32} = imm; +} + +class SOPPe op> : Enc32 { + bits <16> simm16; + + let Inst{15-0} = simm16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding +} + +class SMRDe op, bits<1> imm> : Enc32 { + bits<7> sdst; + bits<7> sbase; + bits<8> offset; + + let Inst{7-0} = offset; + let Inst{8} = imm; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding +} + +let SchedRW = [WriteSALU] in { +class SOP1 pattern> : + InstSI { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP1 = 1; +} + +class SOP2 pattern> : + InstSI { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 0; + let SALU = 1; + let SOP2 = 1; + + let UseNamedOperandTable = 1; +} + +class SOPC op, dag outs, dag ins, string asm, list pattern> : + InstSI, SOPCe { + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPC = 1; + let isCodeGenOnly = 0; + + let UseNamedOperandTable = 1; +} + +class SOPK pattern> : + InstSI { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPK = 1; + + let UseNamedOperandTable = 1; +} + +class SOPP op, dag ins, string asm, list pattern = []> : + InstSI <(outs), ins, asm, pattern >, SOPPe { + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let SALU = 1; + let SOPP = 1; + + let UseNamedOperandTable = 1; +} + +} // let SchedRW = [WriteSALU] + +class SMRD pattern> : + InstSI { + + let LGKM_CNT = 1; + let SMRD = 1; + let mayStore = 0; + let mayLoad = 1; + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteSMEM]; +} + +//===----------------------------------------------------------------------===// +// Vector ALU operations +//===----------------------------------------------------------------------===// + +class VOP1e op> : Enc32 { + bits<8> vdst; + bits<9> src0; + + let Inst{8-0} = src0; + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +class VOP2e op> : Enc32 { + bits<8> vdst; + bits<9> src0; + bits<8> src1; + + let Inst{8-0} = src0; + let 
Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP2_MADKe op> : Enc64 { + + bits<8> vdst; + bits<9> src0; + bits<8> vsrc1; + bits<32> src2; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63-32} = src2; +} + +class VOP3e op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{11} = clamp; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOPCe op> : Enc32 { + bits<9> src0; + bits<8> vsrc1; + + let Inst{8-0} = src0; + let Inst{16-9} = vsrc1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; +} + +class VINTRPe op> : Enc32 { + bits<8> vdst; + bits<8> vsrc; + bits<2> attrchan; + bits<6> attr; + + let Inst{7-0} = vsrc; + let Inst{9-8} = attrchan; + let Inst{15-10} = attr; + let Inst{17-16} = op; + let Inst{25-18} = vdst; + let Inst{31-26} = 0x32; // encoding +} + +class DSe op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{17} = gds; + let Inst{25-18} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{16} = lds; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe op> : Enc64 { + bits<8> vdata; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{18-16} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let 
Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MIMGe op> : Enc64 { + bits<8> vdata; + bits<4> dmask; + bits<1> unorm; + bits<1> glc; + bits<1> da; + bits<1> r128; + bits<1> tfe; + bits<1> lwe; + bits<1> slc; + bits<8> vaddr; + bits<7> srsrc; + bits<7> ssamp; + + let Inst{11-8} = dmask; + let Inst{12} = unorm; + let Inst{13} = glc; + let Inst{14} = da; + let Inst{15} = r128; + let Inst{16} = tfe; + let Inst{17} = lwe; + let Inst{24-18} = op; + let Inst{25} = slc; + let Inst{31-26} = 0x3c; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{57-53} = ssamp{6-2}; +} + +class FLATe op> : Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. + let Inst{55} = tfe; + let Inst{63-56} = vdst; +} + +class EXPe : Enc64 { + bits<4> en; + bits<6> tgt; + bits<1> compr; + bits<1> done; + bits<1> vm; + bits<8> vsrc0; + bits<8> vsrc1; + bits<8> vsrc2; + bits<8> vsrc3; + + let Inst{3-0} = en; + let Inst{9-4} = tgt; + let Inst{10} = compr; + let Inst{11} = done; + let Inst{12} = vm; + let Inst{31-26} = 0x3e; + let Inst{39-32} = vsrc0; + let Inst{47-40} = vsrc1; + let Inst{55-48} = vsrc2; + let Inst{63-56} = vsrc3; +} + +let Uses = [EXEC] in { + +class VOP1 op, dag outs, dag ins, string asm, list pattern> : + VOP1Common , + VOP1e { + let isCodeGenOnly = 0; +} + +class VOP2 op, dag outs, dag ins, string asm, list pattern> : + VOP2Common , VOP2e { + let isCodeGenOnly = 0; +} + +class VOPC op, dag ins, string asm, list pattern> : + VOPCCommon , VOPCe ; + +class VINTRPCommon pattern> : + InstSI { + let mayLoad = 1; + let mayStore = 0; + let hasSideEffects = 0; +} + +} // End Uses = [EXEC] + +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { + +class DS pattern> : + InstSI { + + let LGKM_CNT = 1; + let DS = 1; + let UseNamedOperandTable = 1; + let Uses = [M0]; + + // Most instruction load and store data, so set this as the default. + let mayLoad = 1; + let mayStore = 1; + + let hasSideEffects = 0; + let AsmMatchConverter = "cvtDS"; + let SchedRW = [WriteLDS]; +} + +class MUBUF pattern> : + InstSI { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MUBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let AsmMatchConverter = "cvtMubuf"; + let SchedRW = [WriteVMEM]; +} + +class MTBUF pattern> : + InstSI { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + + let hasSideEffects = 0; + let UseNamedOperandTable = 1; + let SchedRW = [WriteVMEM]; +} + +class FLAT op, dag outs, dag ins, string asm, list pattern> : + InstSI, FLATe { + let FLAT = 1; + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. 
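  // In practice that means a later use of a flat load's destination register
  // has to wait for both vmcnt and lgkmcnt to drain; SIInsertWaits derives
  // exactly that from these two flags via getHwCounts(), assuming the pass
  // and these TSFlags bits stay in sync as noted in InstSI above.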
+ let VM_CNT = 1; + let LGKM_CNT = 1; + + let Uses = [EXEC, FLAT_SCR]; // M0 + + let UseNamedOperandTable = 1; + let hasSideEffects = 0; + let AsmMatchConverter = "cvtFlat"; + let SchedRW = [WriteVMEM]; +} + +class MIMG op, dag outs, dag ins, string asm, list pattern> : + InstSI , MIMGe { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + + let hasSideEffects = 0; // XXX ???? +} + + +} // End Uses = [EXEC] diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp new file mode 100644 index 00000000000..d647c25286f --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -0,0 +1,2723 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + + +#include "SIInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "SIDefines.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), RI() {} + +//===----------------------------------------------------------------------===// +// TargetInstrInfo callbacks +//===----------------------------------------------------------------------===// + +static unsigned getNumOperandsNoGlue(SDNode *Node) { + unsigned N = Node->getNumOperands(); + while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) + --N; + return N; +} + +static SDValue findChainOperand(SDNode *Load) { + SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1); + assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node"); + return LastOp; +} + +/// \brief Returns true if both nodes have the same value for the given +/// operand \p Op, or if both nodes do not have this operand. +static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { + unsigned Opc0 = N0->getMachineOpcode(); + unsigned Opc1 = N1->getMachineOpcode(); + + int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); + int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); + + if (Op0Idx == -1 && Op1Idx == -1) + return true; + + + if ((Op0Idx == -1 && Op1Idx != -1) || + (Op1Idx == -1 && Op0Idx != -1)) + return false; + + // getNamedOperandIdx returns the index for the MachineInstr's operands, + // which includes the result as the first operand. We are indexing into the + // MachineSDNode's operands, so we need to skip the result operand to get + // the real index. + --Op0Idx; + --Op1Idx; + + return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); +} + +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + // TODO: The generic check fails for VALU instructions that should be + // rematerializable due to implicit reads of exec. We really want all of the + // generic logic for this except for this. 
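// Back-reference for nodesHaveSameOperandValue() above: getNamedOperandIdx()
// indexes MachineInstr operands, where operand 0 is the result, while a
// MachineSDNode's operand list carries no result, hence the "--Op0Idx".
// Minimal sketch with a hypothetical operand sitting at MachineInstr index 2:
//
//   int MIIdx = 2;          // index among MachineInstr operands (result first)
//   int SDIdx = MIIdx - 1;  // the same operand among the SDNode's operands
//   SDValue V = Node->getOperand(SDIdx);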
+ switch (MI->getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, + int64_t &Offset0, + int64_t &Offset1) const { + if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) + return false; + + unsigned Opc0 = Load0->getMachineOpcode(); + unsigned Opc1 = Load1->getMachineOpcode(); + + // Make sure both are actually loads. + if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) + return false; + + if (isDS(Opc0) && isDS(Opc1)) { + + // FIXME: Handle this case: + if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) + return false; + + // Check base reg. + if (Load0->getOperand(1) != Load1->getOperand(1)) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + // Skip read2 / write2 variants for simplicity. + // TODO: We should report true if the used offsets are adjacent (excluded + // st64 versions). + if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || + AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) + return false; + + Offset0 = cast(Load0->getOperand(2))->getZExtValue(); + Offset1 = cast(Load1->getOperand(2))->getZExtValue(); + return true; + } + + if (isSMRD(Opc0) && isSMRD(Opc1)) { + assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); + + // Check base reg. + if (Load0->getOperand(0) != Load1->getOperand(0)) + return false; + + const ConstantSDNode *Load0Offset = + dyn_cast(Load0->getOperand(1)); + const ConstantSDNode *Load1Offset = + dyn_cast(Load1->getOperand(1)); + + if (!Load0Offset || !Load1Offset) + return false; + + // Check chain. + if (findChainOperand(Load0) != findChainOperand(Load1)) + return false; + + Offset0 = Load0Offset->getZExtValue(); + Offset1 = Load1Offset->getZExtValue(); + return true; + } + + // MUBUF and MTBUF can access the same addresses. + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { + + // MUBUF and MTBUF have vaddr at different indices. + if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || + findChainOperand(Load0) != findChainOperand(Load1) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || + !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) + return false; + + int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); + int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); + + if (OffIdx0 == -1 || OffIdx1 == -1) + return false; + + // getNamedOperandIdx returns the index for MachineInstrs. Since they + // inlcude the output in the operand list, but SDNodes don't, we need to + // subtract the index by one. + --OffIdx0; + --OffIdx1; + + SDValue Off0 = Load0->getOperand(OffIdx0); + SDValue Off1 = Load1->getOperand(OffIdx1); + + // The offset might be a FrameIndexSDNode. 
+ if (!isa(Off0) || !isa(Off1)) + return false; + + Offset0 = cast(Off0)->getZExtValue(); + Offset1 = cast(Off1)->getZExtValue(); + return true; + } + + return false; +} + +static bool isStride64(unsigned Opc) { + switch (Opc) { + case AMDGPU::DS_READ2ST64_B32: + case AMDGPU::DS_READ2ST64_B64: + case AMDGPU::DS_WRITE2ST64_B32: + case AMDGPU::DS_WRITE2ST64_B64: + return true; + default: + return false; + } +} + +bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const { + unsigned Opc = LdSt->getOpcode(); + if (isDS(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (OffsetImm) { + // Normal, single offset LDS instruction. + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. We + // will use this for some partially aligned loads. + const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset1); + + uint8_t Offset0 = Offset0Imm->getImm(); + uint8_t Offset1 = Offset1Imm->getImm(); + assert(Offset1 > Offset0); + + if (Offset1 - Offset0 == 1) { + // Each of these offsets is in element sized units, so we need to convert + // to bytes of the individual reads. + + unsigned EltSize; + if (LdSt->mayLoad()) + EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + else { + assert(LdSt->mayStore()); + int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + } + + if (isStride64(Opc)) + EltSize *= 64; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = EltSize * Offset0; + return true; + } + + return false; + } + + if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + return false; + + const MachineOperand *AddrReg = getNamedOperand(*LdSt, + AMDGPU::OpName::vaddr); + if (!AddrReg) + return false; + + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + BaseReg = AddrReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + if (isSMRD(Opc)) { + const MachineOperand *OffsetImm = getNamedOperand(*LdSt, + AMDGPU::OpName::offset); + if (!OffsetImm) + return false; + + const MachineOperand *SBaseReg = getNamedOperand(*LdSt, + AMDGPU::OpName::sbase); + BaseReg = SBaseReg->getReg(); + Offset = OffsetImm->getImm(); + return true; + } + + return false; +} + +bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + unsigned Opc0 = FirstLdSt->getOpcode(); + unsigned Opc1 = SecondLdSt->getOpcode(); + + // TODO: This needs finer tuning + if (NumLoads > 4) + return false; + + if (isDS(Opc0) && isDS(Opc1)) + return true; + + if (isSMRD(Opc0) && isSMRD(Opc1)) + return true; + + if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + return true; + + return false; +} + +void +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + + // If we are trying to copy to or from SCC, there is a bug somewhere else 
in + // the backend. While it may be theoretically possible to do this, it should + // never be necessary. + assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); + + static const int16_t Sub0_15[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + }; + + static const int16_t Sub0_7[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + }; + + static const int16_t Sub0_3[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + }; + + static const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + }; + + static const int16_t Sub0_1[] = { + AMDGPU::sub0, AMDGPU::sub1, 0 + }; + + unsigned Opcode; + const int16_t *SubIndices; + + if (AMDGPU::SReg_32RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (DestReg == AMDGPU::VCC) { + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + // FIXME: Hack until VReg_1 removed. + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + .addImm(0) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + + return; + } + + assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_3; + + } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_7; + + } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::S_MOV_B32; + SubIndices = Sub0_15; + + } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + + } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || + AMDGPU::SReg_64RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_1; + + } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_2; + + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || + AMDGPU::SReg_128RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_3; + + } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || + AMDGPU::SReg_256RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_7; + + } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || + 
AMDGPU::SReg_512RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_15; + + } else { + llvm_unreachable("Can't copy register!"); + } + + while (unsigned SubIdx = *SubIndices++) { + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, SubIdx)); + + Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + + if (*SubIndices) + Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + } +} + +unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { + const unsigned Opcode = MI.getOpcode(); + + int NewOpc; + + // Try to map original to commuted opcode + NewOpc = AMDGPU::getCommuteRev(Opcode); + // Check if the commuted (REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) + return NewOpc; + + // Try to map commuted to original opcode + NewOpc = AMDGPU::getCommuteOrig(Opcode); + // Check if the original (non-REV) opcode exists on the target. + if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) + return NewOpc; + + return Opcode; +} + +unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { + + if (DstRC->getSize() == 4) { + return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { + return AMDGPU::S_MOV_B64; + } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { + return AMDGPU::V_MOV_B64_PSEUDO; + } + return AMDGPU::COPY; +} + +void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; + + if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for spilling + // SGPRs. + switch (RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + } + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + MFI->setHasSpilledVGPRs(); + + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. 
+ .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) + .addReg(SrcReg); + } +} + +void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction *MF = MBB.getParent(); + const SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; + + if (RI.isSGPRClass(RC)){ + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + } + } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; + } + } + + if (Opcode != -1) { + FrameInfo->setObjectAlignment(FrameIndex, 4); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) + // Place-holder registers, these will be filled in by + // SIPrepareScratchRegs. + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + + } else { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + } +} + +/// \param @Offset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + const AMDGPUSubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = + static_cast(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); + unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); + unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { + if 
(!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) + .addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); + } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); + + return TmpReg; +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + case AMDGPU::SI_CONSTDATA_PTR: { + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + + BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); + + // Add 32-bit offset from this instruction to the start of the constant data. + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addTargetIndex(AMDGPU::TI_CONSTDATA_START) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0) + .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) + .addReg(AMDGPU::SCC, RegState::Implicit); + MI->eraseFromParent(); + break; + } + case AMDGPU::SGPR_USE: + // This is just a placeholder for register allocation. + MI->eraseFromParent(); + break; + + case AMDGPU::V_MOV_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + + const MachineOperand &SrcOp = MI->getOperand(1); + // FIXME: Will this work for 64-bit floating point immediates? 
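// The expansion below splits a 64-bit immediate into two 32-bit V_MOV_B32s.
// Equivalent plain-integer sketch of the lo/hi split done with APInt below
// (example value only):
//
//   #include <cstdint>
//   uint64_t Imm = 0x4045000000000000ull;             // 42.0 as f64 bits
//   uint32_t Lo  = static_cast<uint32_t>(Imm);        // -> V_MOV_B32 into sub0
//   uint32_t Hi  = static_cast<uint32_t>(Imm >> 32);  // -> V_MOV_B32 into sub1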
+ assert(!SrcOp.isFPImm()); + if (SrcOp.isImm()) { + APInt Imm(64, SrcOp.getImm()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit); + } else { + assert(SrcOp.isReg()); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit); + } + MI->eraseFromParent(); + break; + } + + case AMDGPU::V_CNDMASK_B64_PSEUDO: { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); + unsigned Src0 = MI->getOperand(1).getReg(); + unsigned Src1 = MI->getOperand(2).getReg(); + const MachineOperand &SrcCond = MI->getOperand(3); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addOperand(SrcCond); + BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addOperand(SrcCond); + MI->eraseFromParent(); + break; + } + } + return true; +} + +MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI) const { + + if (MI->getNumOperands() < 3) + return nullptr; + + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + assert(Src0Idx != -1 && "Should always have src0 operand"); + + MachineOperand &Src0 = MI->getOperand(Src0Idx); + if (!Src0.isReg()) + return nullptr; + + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + if (Src1Idx == -1) + return nullptr; + + MachineOperand &Src1 = MI->getOperand(Src1Idx); + + // Make sure it's legal to commute operands for VOP2. + if (isVOP2(MI->getOpcode()) && + (!isOperandLegal(MI, Src0Idx, &Src1) || + !isOperandLegal(MI, Src1Idx, &Src0))) { + return nullptr; + } + + if (!Src1.isReg()) { + // Allow commuting instructions with Imm operands. + if (NewMI || !Src1.isImm() || + (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + return nullptr; + } + + // Be sure to copy the source modifiers to the right place. + if (MachineOperand *Src0Mods + = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods + = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + + int Src0ModsVal = Src0Mods->getImm(); + if (!Src1Mods && Src0ModsVal != 0) + return nullptr; + + // XXX - This assert might be a lie. It might be useful to have a neg + // modifier with 0.0. 
+ int Src1ModsVal = Src1Mods->getImm(); + assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + } + + unsigned Reg = Src0.getReg(); + unsigned SubReg = Src0.getSubReg(); + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else + llvm_unreachable("Should only have immediates"); + + Src1.ChangeToRegister(Reg, false); + Src1.setSubReg(SubReg); + } else { + MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + } + + if (MI) + MI->setDesc(get(commuteOpcode(*MI))); + + return MI; +} + +// This needs to be implemented because the source modifiers may be inserted +// between the true commutable operands, and the base +// TargetInstrInfo::commuteInstruction uses it. +bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isCommutable()) + return false; + + unsigned Opc = MI->getOpcode(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return false; + + // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on + // immediate. + if (!MI->getOperand(Src0Idx).isReg()) + return false; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return false; + + if (!MI->getOperand(Src1Idx).isReg()) + return false; + + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + + SrcOpIdx1 = Src0Idx; + SrcOpIdx2 = Src1Idx; + return true; +} + +MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, + unsigned SrcReg) const { + return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), + DstReg) .addReg(SrcReg); +} + +bool SIInstrInfo::isMov(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B32_e64: + return true; + } +} + +bool +SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + return RC != &AMDGPU::EXECRegRegClass; +} + +static void removeModOperands(MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src0_modifiers); + int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src1_modifiers); + int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2_modifiers); + + MI.RemoveOperand(Src2ModIdx); + MI.RemoveOperand(Src1ModIdx); + MI.RemoveOperand(Src0ModIdx); +} + +bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + if (!MRI->hasOneNonDBGUse(Reg)) + return false; + + unsigned Opc = UseMI->getOpcode(); + if (Opc == AMDGPU::V_MAD_F32) { + // Don't fold if we are using source modifiers. The new VOP2 instructions + // don't have them. 
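// Background on the two VOP2 forms this fold targets, assuming the usual SI
// semantics (sketch; written as formulas rather than assembly operand order):
//
//   float madmk(float s0, float s1, float K) { return s0 * K + s1; }  // v_madmk_f32
//   float madak(float s0, float s1, float K) { return s0 * s1 + K; }  // v_madak_f32
//
// i.e. v_madmk folds a constant in a multiplied position and v_madak folds
// the added constant, which is why the code below distinguishes whether the
// constant feeds src0 or src2 of the original v_mad_f32.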
+ if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + return false; + } + + MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + + // Multiplied part is the constant: Use v_madmk_f32 + // We should only expect these to be on src0 due to canonicalizations. + if (Src0->isReg() && Src0->getReg() == Reg) { + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + if (!Src2->isReg() || + (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + return false; + + // We need to do some weird looking operand shuffling since the madmk + // operands are out of the normal expected order with the multiplied + // constant as the last operand. + // + // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 + // src0 -> src2 K + // src1 -> src0 + // src2 -> src1 + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + unsigned Src1Reg = Src1->getReg(); + unsigned Src1SubReg = Src1->getSubReg(); + unsigned Src2Reg = Src2->getReg(); + unsigned Src2SubReg = Src2->getSubReg(); + Src0->setReg(Src1Reg); + Src0->setSubReg(Src1SubReg); + Src0->setIsKill(Src1->isKill()); + + Src1->setReg(Src2Reg); + Src1->setSubReg(Src2SubReg); + Src1->setIsKill(Src2->isKill()); + + Src2->ChangeToImmediate(Imm); + + removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + + // Added part is the constant: Use v_madak_f32 + if (Src2->isReg() && Src2->getReg() == Reg) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + if (!Src0->isImm() && + (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + + if (!Src1->isReg() || + (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + + const int64_t Imm = DefMI->getOperand(1).getImm(); + + // FIXME: This would be a lot easier if we could return a new instruction + // instead of having to modify in place. + + // Remove these first since they are at the end. + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::omod)); + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + AMDGPU::OpName::clamp)); + + Src2->ChangeToImmediate(Imm); + + // These come before src2. 
+ removeModOperands(*UseMI); + UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + + bool DeleteDef = MRI->hasOneNonDBGUse(Reg); + if (DeleteDef) + DefMI->eraseFromParent(); + + return true; + } + } + + return false; +} + +bool +SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B32_e32: + return MI->getOperand(1).isImm(); + } +} + +static bool offsetsDoNotOverlap(int WidthA, int OffsetA, + int WidthB, int OffsetB) { + int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; + int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; + int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; + return LowOffset + LowWidth <= HighOffset; +} + +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const { + unsigned BaseReg0, Offset0; + unsigned BaseReg1, Offset1; + + if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && + getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && + "read2 / write2 not expected here yet"); + unsigned Width0 = (*MIa->memoperands_begin())->getSize(); + unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + if (BaseReg0 == BaseReg1 && + offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + return true; + } + } + + return false; +} + +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, + MachineInstr *MIb, + AliasAnalysis *AA) const { + unsigned Opc0 = MIa->getOpcode(); + unsigned Opc1 = MIb->getOpcode(); + + assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + "MIa must load from or modify a memory location"); + assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + "MIb must load from or modify a memory location"); + + if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + return false; + + // XXX - Can we relax this between address spaces? + if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + return false; + + // TODO: Should we check the address space from the MachineMemOperand? That + // would allow us to distinguish objects we know don't alias based on the + // underlying addres space, even if it was lowered to a different one, + // e.g. private accesses lowered to use MUBUF instructions on a scratch + // buffer. 
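// Worked example of the offset-overlap rule above; a standalone sketch that
// only assumes the static offsetsDoNotOverlap helper defined earlier in this
// file. The function name below is illustrative, not part of the real API.
static bool offsetsDoNotOverlapExamples() {
  // A 4-byte access at offset 0 and a 4-byte access at offset 4 off the same
  // base register are disjoint: the lower access ends where the higher starts.
  bool Disjoint = offsetsDoNotOverlap(/*WidthA=*/4, /*OffsetA=*/0,
                                      /*WidthB=*/4, /*OffsetB=*/4);
  // An 8-byte access at offset 0 overlaps a 4-byte access at offset 4.
  bool Overlaps = !offsetsDoNotOverlap(/*WidthA=*/8, /*OffsetA=*/0,
                                       /*WidthB=*/4, /*OffsetB=*/4);
  return Disjoint && Overlaps; // both expectations hold
}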
+ if (isDS(Opc0)) { + if (isDS(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1); + } + + if (isMUBUF(Opc0) || isMTBUF(Opc0)) { + if (isMUBUF(Opc1) || isMTBUF(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isSMRD(Opc1); + } + + if (isSMRD(Opc0)) { + if (isSMRD(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + } + + if (isFLAT(Opc0)) { + if (isFLAT(Opc1)) + return checkInstOffsetsDoNotOverlap(MIa, MIb); + + return false; + } + + return false; +} + +bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { + int64_t SVal = Imm.getSExtValue(); + if (SVal >= -16 && SVal <= 64) + return true; + + if (Imm.getBitWidth() == 64) { + uint64_t Val = Imm.getZExtValue(); + return (DoubleToBits(0.0) == Val) || + (DoubleToBits(1.0) == Val) || + (DoubleToBits(-1.0) == Val) || + (DoubleToBits(0.5) == Val) || + (DoubleToBits(-0.5) == Val) || + (DoubleToBits(2.0) == Val) || + (DoubleToBits(-2.0) == Val) || + (DoubleToBits(4.0) == Val) || + (DoubleToBits(-4.0) == Val); + } + + // The actual type of the operand does not seem to matter as long + // as the bits match one of the inline immediate values. For example: + // + // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, + // so it is a legal inline immediate. + // + // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in + // floating-point, so it is a legal inline immediate. + uint32_t Val = Imm.getZExtValue(); + + return (FloatToBits(0.0f) == Val) || + (FloatToBits(1.0f) == Val) || + (FloatToBits(-1.0f) == Val) || + (FloatToBits(0.5f) == Val) || + (FloatToBits(-0.5f) == Val) || + (FloatToBits(2.0f) == Val) || + (FloatToBits(-2.0f) == Val) || + (FloatToBits(4.0f) == Val) || + (FloatToBits(-4.0f) == Val); +} + +bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, + unsigned OpSize) const { + if (MO.isImm()) { + // MachineOperand provides no way to tell the true operand size, since it + // only records a 64-bit value. We need to know the size to determine if a + // 32-bit floating point immediate bit pattern is legal for an integer + // immediate. It would be for any 32-bit integer operand, but would not be + // for a 64-bit one. 
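// Standalone sketch of the 32-bit inline-constant rule checked above: small
// signed integers in [-16, 64] encode directly, otherwise the raw bits must
// match one of the supported floating-point constants. isSIInlineConstant32
// and floatBits are illustrative names, not part of the real API.
#include <cstdint>
#include <cstring>

static uint32_t floatBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // reinterpret the float's bit pattern
  return Bits;
}

static bool isSIInlineConstant32(int32_t V) {
  if (V >= -16 && V <= 64)
    return true;
  const float Consts[] = {0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f};
  for (float C : Consts)
    if (floatBits(C) == static_cast<uint32_t>(V))
      return true; // e.g. 0x3f800000 (1.0f) is a legal inline immediate
  return false;
}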
+ + unsigned BitSize = 8 * OpSize; + return isInlineConstant(APInt(BitSize, MO.getImm(), true)); + } + + return false; +} + +bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, + unsigned OpSize) const { + return MO.isImm() && !isInlineConstant(MO, OpSize); +} + +static bool compareMachineOp(const MachineOperand &Op0, + const MachineOperand &Op1) { + if (Op0.getType() != Op1.getType()) + return false; + + switch (Op0.getType()) { + case MachineOperand::MO_Register: + return Op0.getReg() == Op1.getReg(); + case MachineOperand::MO_Immediate: + return Op0.getImm() == Op1.getImm(); + default: + llvm_unreachable("Didn't expect to be comparing these operand types"); + } +} + +bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; + + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + + if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) + return true; + + if (OpInfo.RegClass < 0) + return false; + + unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); + if (isLiteralConstant(MO, OpSize)) + return RI.opCanUseLiteralConstant(OpInfo.OperandType); + + return RI.opCanUseInlineConstant(OpInfo.OperandType); +} + +bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + int Op32 = AMDGPU::getVOPe32(Opcode); + if (Op32 == -1) + return false; + + return pseudoToMCOpcode(Op32) != -1; +} + +bool SIInstrInfo::hasModifiers(unsigned Opcode) const { + // The src0_modifier operand is present on all instructions + // that have modifiers. + + return AMDGPU::getNamedOperandIdx(Opcode, + AMDGPU::OpName::src0_modifiers) != -1; +} + +bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const { + const MachineOperand *Mods = getNamedOperand(MI, OpName); + return Mods && Mods->getImm(); +} + +bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const { + // Literal constants use the constant bus. + if (isLiteralConstant(MO, OpSize)) + return true; + + if (!MO.isReg() || !MO.isUse()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) + return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); + + // FLAT_SCR is just an SGPR pair. + if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) + return true; + + // EXEC register uses the constant bus. + if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) + return true; + + // SGPRs use the constant bus + if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { + return true; + } + + return false; +} + +bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const { + uint16_t Opcode = MI->getOpcode(); + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); + int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + + // Make sure the number of operands is correct. 
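// Minimal sketch of the VOP* constant-bus restriction enforced below: among
// the real source operands, at most one may read the constant bus (an SGPR,
// a literal, M0, VCC, EXEC or FLAT_SCR), and reuse of the same SGPR only
// counts once. Operand and kInvalidReg are illustrative stand-ins, not LLVM
// types.
struct Operand {
  bool UsesConstantBus; // SGPR, literal, M0, VCC, EXEC or FLAT_SCR
  bool IsReg;
  unsigned Reg;
};

static bool constantBusIsLegal(const Operand *Ops, unsigned NumOps) {
  const unsigned kInvalidReg = ~0u;
  unsigned Count = 0, SGPRUsed = kInvalidReg;
  for (unsigned I = 0; I != NumOps; ++I) {
    if (!Ops[I].UsesConstantBus)
      continue;
    if (Ops[I].IsReg) {
      if (Ops[I].Reg != SGPRUsed)
        ++Count; // a new SGPR takes another constant-bus slot
      SGPRUsed = Ops[I].Reg;
    } else {
      ++Count; // a literal constant always uses the bus
    }
  }
  return Count <= 1;
}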
+ const MCInstrDesc &Desc = get(Opcode); + if (!Desc.isVariadic() && + Desc.getNumOperands() != MI->getNumExplicitOperands()) { + ErrInfo = "Instruction has wrong number of operands."; + return false; + } + + // Make sure the register classes are correct + for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isFPImm()) { + ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " + "all fp values to integers."; + return false; + } + + int RegClass = Desc.OpInfo[i].RegClass; + + switch (Desc.OpInfo[i].OperandType) { + case MCOI::OPERAND_REGISTER: + if (MI->getOperand(i).isImm()) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case AMDGPU::OPERAND_REG_IMM32: + break; + case AMDGPU::OPERAND_REG_INLINE_C: + if (isLiteralConstant(MI->getOperand(i), + RI.getRegClass(RegClass)->getSize())) { + ErrInfo = "Illegal immediate value for operand."; + return false; + } + break; + case MCOI::OPERAND_IMMEDIATE: + // Check if this operand is an immediate. + // FrameIndex operands will be replaced by immediates, so they are + // allowed. + if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { + ErrInfo = "Expected immediate, but got non-immediate"; + return false; + } + // Fall-through + default: + continue; + } + + if (!MI->getOperand(i).isReg()) + continue; + + if (RegClass != -1) { + unsigned Reg = MI->getOperand(i).getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + const TargetRegisterClass *RC = RI.getRegClass(RegClass); + if (!RC->contains(Reg)) { + ErrInfo = "Operand has incorrect register class."; + return false; + } + } + } + + + // Verify VOP* + if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + // Only look at the true operands. Only a real operand can use the constant + // bus, and we don't want to check pseudo-operands like the source modifier + // flags. + const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; + + unsigned ConstantBusCount = 0; + unsigned SGPRUsed = AMDGPU::NoRegister; + for (int OpIdx : OpIndices) { + if (OpIdx == -1) + break; + const MachineOperand &MO = MI->getOperand(OpIdx); + if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (MO.isReg()) { + if (MO.getReg() != SGPRUsed) + ++ConstantBusCount; + SGPRUsed = MO.getReg(); + } else { + ++ConstantBusCount; + } + } + } + if (ConstantBusCount > 1) { + ErrInfo = "VOP* instruction uses the constant bus more than once"; + return false; + } + } + + // Verify misc. restrictions on specific instructions. + if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || + Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { + const MachineOperand &Src0 = MI->getOperand(Src0Idx); + const MachineOperand &Src1 = MI->getOperand(Src1Idx); + const MachineOperand &Src2 = MI->getOperand(Src2Idx); + if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { + if (!compareMachineOp(Src0, Src1) && + !compareMachineOp(Src0, Src2)) { + ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; + return false; + } + } + } + + return true; +} + +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: return AMDGPU::INSTRUCTION_LIST_END; + case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; + case AMDGPU::COPY: return AMDGPU::COPY; + case AMDGPU::PHI: return AMDGPU::PHI; + case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; + case AMDGPU::S_MOV_B32: + return MI.getOperand(1).isReg() ? 
+ AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; + case AMDGPU::S_SUB_I32: + case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; + case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; + case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; + case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; + case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; + case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; + case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; + case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; + case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; + case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; + case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; + case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; + case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; + case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; + case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; + case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; + case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; + case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; + case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; + case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; + case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; + case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; + case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + } +} + +bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { + return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; +} + +const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCInstrDesc &Desc = get(MI.getOpcode()); + if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || + Desc.OpInfo[OpNo].RegClass == -1) { + unsigned Reg = MI.getOperand(OpNo).getReg(); + + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI.getRegClass(Reg); + return RI.getPhysRegClass(Reg); + } + + unsigned RCID = Desc.OpInfo[OpNo].RegClass; + return RI.getRegClass(RCID); +} + +bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: + return RI.hasVGPRs(getOpRegClass(MI, 0)); + default: + return RI.hasVGPRs(getOpRegClass(MI, OpNo)); + } +} + +void 
SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { + MachineBasicBlock::iterator I = MI; + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &MO = MI->getOperand(OpIdx); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; + const TargetRegisterClass *RC = RI.getRegClass(RCID); + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (MO.isReg()) + Opcode = AMDGPU::COPY; + else if (RI.isSGPRClass(RC)) + Opcode = AMDGPU::S_MOV_B32; + + + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); + if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) + VRC = &AMDGPU::VReg_64RegClass; + else + VRC = &AMDGPU::VGPR_32RegClass; + + unsigned Reg = MRI.createVirtualRegister(VRC); + DebugLoc DL = MBB->findDebugLoc(I); + BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) + .addOperand(MO); + MO.ChangeToRegister(Reg, false); +} + +unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) + const { + assert(SuperReg.isReg()); + + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + unsigned SubReg = MRI.createVirtualRegister(SubRC); + + // Just in case the super register is itself a sub-register, copy it to a new + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to + // eliminate this extra copy. + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) + .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(NewSuperReg, 0, SubIdx); + + return SubReg; +} + +MachineOperand SIInstrInfo::buildExtractSubRegOrImm( + MachineBasicBlock::iterator MII, + MachineRegisterInfo &MRI, + MachineOperand &Op, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const { + if (Op.isImm()) { + // XXX - Is there a better way to do this? 
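// Sketch of the 64-bit immediate splitting used here and in split64BitImm
// below: sub0 receives the low 32 bits and sub1 the high 32 bits. splitImm64
// is an illustrative helper name only.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> splitImm64(uint64_t Imm) {
  uint32_t Lo = static_cast<uint32_t>(Imm & 0xFFFFFFFFu); // -> sub0
  uint32_t Hi = static_cast<uint32_t>(Imm >> 32);         // -> sub1
  return {Lo, Hi};
}
// e.g. splitImm64(0x123456789ULL) yields {0x23456789, 0x1}.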
+ if (SubIdx == AMDGPU::sub0) + return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); + if (SubIdx == AMDGPU::sub1) + return MachineOperand::CreateImm(Op.getImm() >> 32); + + llvm_unreachable("Unhandled register index for immediate"); + } + + unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, + SubIdx, SubRC); + return MachineOperand::CreateReg(SubReg, false); +} + +unsigned SIInstrInfo::split64BitImm(SmallVectorImpl &Worklist, + MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineOperand &Op) const { + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned Dst = MRI.createVirtualRegister(RC); + + MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), + LoDst) + .addImm(Op.getImm() & 0xFFFFFFFF); + MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), + HiDst) + .addImm(Op.getImm() >> 32); + + BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) + .addReg(LoDst) + .addImm(AMDGPU::sub0) + .addReg(HiDst) + .addImm(AMDGPU::sub1); + + Worklist.push_back(Lo); + Worklist.push_back(Hi); + + return Dst; +} + +// Change the order of operands from (0, 1, 2) to (0, 2, 1) +void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { + assert(Inst->getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst->getOperand(1); + Inst->RemoveOperand(1); + Inst->addOperand(Op1); +} + +bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO) const { + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const TargetRegisterClass *DefinedRC = + OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; + if (!MO) + MO = &MI->getOperand(OpIdx); + + if (isVALU(InstDesc.Opcode) && + usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + unsigned SGPRUsed = + MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI->getOperand(i); + if (Op.isReg() && Op.getReg() != SGPRUsed && + usesConstantBus(MRI, Op, getOpSize(*MI, i))) { + return false; + } + } + } + + if (MO->isReg()) { + assert(DefinedRC); + const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + } + + + // Handle non-register types that are treated like immediates. + assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); + + if (!DefinedRC) { + // This operand expects an immediate. 
+ return true; + } + + return isImmOperandLegal(MI, OpIdx, *MO); +} + +void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src1); + int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src2); + + // Legalize VOP2 + if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { + // Legalize src0 + if (!isOperandLegal(MI, Src0Idx)) + legalizeOpWithMove(MI, Src0Idx); + + // Legalize src1 + if (isOperandLegal(MI, Src1Idx)) + return; + + // Usually src0 of VOP2 instructions allow more types of inputs + // than src1, so try to commute the instruction to decrease our + // chances of having to insert a MOV instruction to legalize src1. + if (MI->isCommutable()) { + if (commuteInstruction(MI)) + // If we are successful in commuting, then we know MI is legal, so + // we are done. + return; + } + + legalizeOpWithMove(MI, Src1Idx); + return; + } + + // XXX - Do any VOP3 instructions read VCC? + // Legalize VOP3 + if (isVOP3(MI->getOpcode())) { + int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + if (MO.isReg()) { + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; + } + } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { + // If it is not a register and not a literal constant, then it must be + // an inline constant which is always legal. + continue; + } + // If we make it this far, then the operand is not legal and we must + // legalize it. + legalizeOpWithMove(MI, Idx); + } + } + + // Legalize REG_SEQUENCE and PHI + // The register class of the operands much be the same type as the register + // class of the output. + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || + MI->getOpcode() == AMDGPU::PHI) { + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; + for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { + if (!MI->getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + continue; + const TargetRegisterClass *OpRC = + MRI.getRegClass(MI->getOperand(i).getReg()); + if (RI.hasVGPRs(OpRC)) { + VRC = OpRC; + } else { + SRC = OpRC; + } + } + + // If any of the operands are VGPR registers, then they all most be + // otherwise we will create illegal VGPR->SGPR copies when legalizing + // them. + if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { + if (!VRC) { + assert(SRC); + VRC = RI.getEquivalentVGPRClass(SRC); + } + RC = VRC; + } else { + RC = SRC; + } + + // Update all the operands so they have the same type. 
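// Sketch of the class-unification rule above: if any input of a REG_SEQUENCE
// or PHI lives in a VGPR class, or the result is not an SGPR class, every
// input must be moved to the equivalent VGPR class, because VGPR->SGPR copies
// are illegal. RegBank and pickResultBank are illustrative, not LLVM types.
enum class RegBank { SGPR, VGPR };

static RegBank pickResultBank(const RegBank *Inputs, unsigned NumInputs,
                              bool DefIsSGPR) {
  for (unsigned I = 0; I != NumInputs; ++I)
    if (Inputs[I] == RegBank::VGPR)
      return RegBank::VGPR; // one VGPR input forces the whole node to VGPRs
  return DefIsSGPR ? RegBank::SGPR : RegBank::VGPR;
}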
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { + if (!MI->getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + continue; + unsigned DstReg = MRI.createVirtualRegister(RC); + MachineBasicBlock *InsertBB; + MachineBasicBlock::iterator Insert; + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + InsertBB = MI->getParent(); + Insert = MI; + } else { + // MI is a PHI instruction. + InsertBB = MI->getOperand(i + 1).getMBB(); + Insert = InsertBB->getFirstTerminator(); + } + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), + get(AMDGPU::COPY), DstReg) + .addOperand(MI->getOperand(i)); + MI->getOperand(i).setReg(DstReg); + } + } + + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + + // Legalize MUBUF* instructions + // FIXME: If we start using the non-addr64 instructions for compute, we + // may need to legalize them here. + int SRsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (SRsrcIdx != -1) { + // We have an MUBUF instruction + MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), + RI.getRegClass(SRsrcRC))) { + // The operands are legal. + // FIXME: We may need to legalize operands besided srsrc. + return; + } + + MachineBasicBlock &MBB = *MI->getParent(); + // Extract the the ptr from the resource descriptor. 
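// Layout sketch for the replacement 128-bit resource descriptor built below,
// assuming only what this function does with it: the 64-bit base pointer is
// zeroed (the pointer extracted from the old descriptor is folded into VADDR
// instead) and the default data format constant fills the two high dwords.
// BufferRsrc and makeDefaultRsrc are illustrative names only.
#include <cstdint>

struct BufferRsrc {
  uint64_t BasePointer; // sub0_sub1: set to 0 here
  uint32_t Dword2;      // sub2: RSRC_DATA_FORMAT & 0xFFFFFFFF
  uint32_t Dword3;      // sub3: RSRC_DATA_FORMAT >> 32
};

static BufferRsrc makeDefaultRsrc(uint64_t RsrcDataFormat) {
  return {0, static_cast<uint32_t>(RsrcDataFormat & 0xFFFFFFFF),
          static_cast<uint32_t>(RsrcDataFormat >> 32)};
}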
+ + // SRsrcPtrLo = srsrc:sub0 + unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); + + // SRsrcPtrHi = srsrc:sub1 + unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); + + // Create an empty resource descriptor + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + // Zero64 = 0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), + Zero64) + .addImm(0); + + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); + + // NewSRsrc = {Zero64, SRsrcFormat} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + unsigned NewVAddrLo; + unsigned NewVAddrHi; + if (VAddr) { + // This is already an ADDR64 instruction so we need to add the pointer + // extracted from the resource descriptor to the current value of VAddr. + NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), + NewVAddrLo) + .addReg(SRsrcPtrLo) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0) + .addReg(AMDGPU::VCC, RegState::ImplicitDefine); + + // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), + NewVAddrHi) + .addReg(SRsrcPtrHi) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1) + .addReg(AMDGPU::VCC, RegState::ImplicitDefine) + .addReg(AMDGPU::VCC, RegState::Implicit); + + } else { + // This instructions is the _OFFSET variant, so we need to convert it to + // ADDR64. + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); + + // Create the new instruction. + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + MachineInstr *Addr64 = + BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. 
+ .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0); // tfe + + MI->removeFromParent(); + MI = Addr64; + + NewVAddrLo = SRsrcPtrLo; + NewVAddrHi = SRsrcPtrHi; + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + } + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); + + + // Update the instruction to use NewVaddr + VAddr->setReg(NewVAddr); + // Update the instruction to use NewSRsrc + SRsrc->setReg(NewSRsrc); + } +} + +void SIInstrInfo::splitSMRD(MachineInstr *MI, + const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const { + + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock *MBB = MI->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned RegLo = MRI.createVirtualRegister(HalfRC); + unsigned RegHi = MRI.createVirtualRegister(HalfRC); + unsigned HalfSize = HalfRC->getSize(); + const MachineOperand *OffOp = + getNamedOperand(*MI, AMDGPU::OpName::offset); + const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); + + // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes + // on VI. + + bool IsKill = SBase->isKill(); + if (OffOp) { + bool isVI = + MBB->getParent()->getSubtarget().getGeneration() >= + AMDGPUSubtarget::VOLCANIC_ISLANDS; + unsigned OffScale = isVI ? 1 : 4; + // Handle the _IMM variant + unsigned LoOffset = OffOp->getImm() * OffScale; + unsigned HiOffset = LoOffset + HalfSize; + Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) + // Use addReg instead of addOperand + // to make sure kill flag is cleared. + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addImm(LoOffset / OffScale); + + if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { + unsigned OffsetSGPR = + MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) + .addImm(HiOffset); // The offset in register is in bytes. 
+ Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addReg(OffsetSGPR); + } else { + Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addImm(HiOffset / OffScale); + } + } else { + // Handle the _SGPR variant + MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); + Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) + .addReg(SBase->getReg(), 0, SBase->getSubReg()) + .addOperand(*SOff); + unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) + .addOperand(*SOff) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addReg(SBase->getReg(), getKillRegState(IsKill), + SBase->getSubReg()) + .addReg(OffsetSGPR); + } + + unsigned SubLo, SubHi; + switch (HalfSize) { + case 4: + SubLo = AMDGPU::sub0; + SubHi = AMDGPU::sub1; + break; + case 8: + SubLo = AMDGPU::sub0_sub1; + SubHi = AMDGPU::sub2_sub3; + break; + case 16: + SubLo = AMDGPU::sub0_sub1_sub2_sub3; + SubHi = AMDGPU::sub4_sub5_sub6_sub7; + break; + case 32: + SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; + SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + break; + default: + llvm_unreachable("Unhandled HalfSize"); + } + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) + .addOperand(MI->getOperand(0)) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); +} + +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { + MachineBasicBlock *MBB = MI->getParent(); + switch (MI->getOpcode()) { + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: { + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offsets on SI and byte offset on VI + // and MUBUF instructions always take a byte offset. 
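// Sketch of the offset conversion performed below: SI/CI SMRD immediates are
// in dwords, while VI immediates and all MUBUF offsets are in bytes, so the
// value must be scaled by 4 on pre-VI targets. The helper and parameter names
// are illustrative only.
static unsigned smrdOffsetToMubufBytes(unsigned ImmOffset, bool IsVIOrLater) {
  return IsVIOrLater ? ImmOffset : ImmOffset << 2; // dwords -> bytes on SI/CI
}
// e.g. an SI offset of 3 dwords becomes a 12-byte MUBUF offset.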
+ ImmOffset = MI->getOperand(2).getImm(); + if (MBB->getParent()->getSubtarget().getGeneration() <= + AMDGPUSubtarget::SEA_ISLANDS) + ImmOffset <<= 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(RsrcDataFormat >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(SRsrc); + } else { + MI->getOperand(2).ChangeToRegister(SRsrc, false); + } + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe + + const TargetRegisterClass *NewDstRC = + RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); + + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + break; + } + case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, + AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX16_SGPR: { + MachineInstr *Lo, *Hi; + splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, + AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); + MI->eraseFromParent(); + moveSMRDToVALU(Lo, MRI); + moveSMRDToVALU(Hi, MRI); + break; + } + } +} + +void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { + SmallVector Worklist; + Worklist.push_back(&TopInst); + + while (!Worklist.empty()) { + MachineInstr *Inst = Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst->getParent(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + + // Handle some special cases + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; + case AMDGPU::S_MOV_B64: { + DebugLoc DL = Inst->getDebugLoc(); + + // If the source operand is a register we can replace this with a + // copy. 
+ if (Inst->getOperand(1).isReg()) { + MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) + .addOperand(Inst->getOperand(0)) + .addOperand(Inst->getOperand(1)); + Worklist.push_back(Copy); + } else { + // Otherwise, we need to split this into two movs, because there is + // no 64-bit VALU move instruction. + unsigned Reg = Inst->getOperand(0).getReg(); + unsigned Dst = split64BitImm(Worklist, + Inst, + MRI, + MRI.getRegClass(Reg), + Inst->getOperand(1)); + MRI.replaceRegWith(Reg, Dst); + } + Inst->eraseFromParent(); + continue; + } + case AMDGPU::S_AND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_OR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_XOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_NOT_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BFE_I64: { + splitScalar64BitBFE(Worklist, Inst); + Inst->eraseFromParent(); + continue; + } + + case AMDGPU::S_LSHL_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B32: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHL_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHLREV_B64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_ASHRREV_I64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B64: + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + NewOpcode = AMDGPU::V_LSHRREV_B64; + swapOperands(Inst); + } + break; + + case AMDGPU::S_BFE_U64: + case AMDGPU::S_BFM_B64: + llvm_unreachable("Moving this op to VALU not implemented"); + } + + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { + // We cannot move this instruction to the VALU, so we should try to + // legalize its operands instead. + legalizeOperands(Inst); + continue; + } + + // Use the new VALU Opcode. + const MCInstrDesc &NewDesc = get(NewOpcode); + Inst->setDesc(NewDesc); + + // Remove any references to SCC. Vector instructions can't read from it, and + // We're just about to add the implicit use / defs of VCC, and we don't want + // both. + for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { + MachineOperand &Op = Inst->getOperand(i); + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) + Inst->RemoveOperand(i); + } + + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + Inst->addOperand(MachineOperand::CreateImm(0)); + } + + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + } + + // Update the destination register class. + + const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); + + switch (Opcode) { + // For target instructions, getOpRegClass just returns the virtual + // register class associated with the operand, so we need to find an + // equivalent VGPR register class in order to move the instruction to the + // VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + continue; + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + continue; + break; + default: + break; + } + + unsigned DstReg = Inst->getOperand(0).getReg(); + unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + + // Legalize the operands + legalizeOperands(Inst); + + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); + } + } + } +} + +//===----------------------------------------------------------------------===// +// Indirect addressing callbacks +//===----------------------------------------------------------------------===// + +unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? 
+ MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + MachineOperand &Src1 = Inst->getOperand(2); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? + MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1RC = Src1.isReg() ? + MRI.getRegClass(Src1.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub0, Src1SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. 
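// Worked sketch of the 64-bit split performed above for bitwise ops: the low
// and high halves can be combined independently with the 32-bit opcode, then
// reassembled with REG_SEQUENCE. and64ViaHalves is an illustrative name.
#include <cstdint>

static uint64_t and64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t Lo = static_cast<uint32_t>(A) & static_cast<uint32_t>(B);             // sub0
  uint32_t Hi = static_cast<uint32_t>(A >> 32) & static_cast<uint32_t>(B >> 32); // sub1
  return (static_cast<uint64_t>(Hi) << 32) | Lo; // REG_SEQUENCE of the halves
}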
+ Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); + const TargetRegisterClass *SrcRC = Src.isReg() ? + MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + Worklist.push_back(First); + Worklist.push_back(Second); +} + +void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + uint32_t Imm = Inst->getOperand(2).getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + (void) Offset; + + // Only sext_inreg cases handled. + assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && + BitWidth <= 32 && + Offset == 0 && + "Not implemented"); + + if (BitWidth < 32) { + unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) + .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) + .addImm(31) + .addReg(MidRegLo); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(MidRegLo) + .addImm(AMDGPU::sub0) + .addReg(MidRegHi) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + return; + } + + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) + .addImm(31) + .addReg(Src.getReg(), 0, AMDGPU::sub0); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) + .addReg(Src.getReg(), 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(TmpReg) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); +} + +void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, + MachineInstr *Inst) const { + // Add the implict and explicit register definitions. 
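// Sketch of the 64-bit bit-count lowering above: V_BCNT_U32_B32 adds its
// second operand to the population count, so counting the low half with an
// add-in of 0 and then the high half with an add-in of the first result gives
// the full 64-bit count. The helper names below are illustrative only.
#include <cstdint>

static unsigned popcount32(uint32_t V) {
  unsigned N = 0;
  for (; V; V &= V - 1) // clear the lowest set bit each iteration
    ++N;
  return N;
}

static unsigned popcount64ViaHalves(uint64_t V) {
  unsigned Mid = popcount32(static_cast<uint32_t>(V)) + 0;  // first V_BCNT
  return popcount32(static_cast<uint32_t>(V >> 32)) + Mid;  // second V_BCNT
}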
+ if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, + int OpIndices[3]) const { + const MCInstrDesc &Desc = get(MI->getOpcode()); + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = AMDGPU::NoRegister; + + // First we need to consider the instruction's operand requirements before + // legalizing. Some operands are required to be SGPRs, such as implicit uses + // of VCC, but we are still bound by the constant bus requirement to only use + // one. + // + // If the operand's class is an SGPR, we can never move it. + + for (const MachineOperand &MO : MI->implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + if (MO.getReg() == AMDGPU::VCC) + return AMDGPU::VCC; + + if (MO.getReg() == AMDGPU::FLAT_SCR) + return AMDGPU::FLAT_SCR; + } + + unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = OpIndices[i]; + if (Idx == -1) + break; + + const MachineOperand &MO = MI->getOperand(Idx); + if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) + SGPRReg = MO.getReg(); + + if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + UsedSGPRs[i] = MO.getReg(); + } + + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; + + // We don't have a required SGPR operand, so we have a bit more freedom in + // selecting operands to move. + + // Try to select the most used SGPR. If an SGPR is equal to one of the + // others, we choose that. + // + // e.g. 
+ // V_FMA_F32 v0, s0, s0, s0 -> No moves + // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + + if (UsedSGPRs[0] != AMDGPU::NoRegister) { + if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[0]; + } + + if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { + if (UsedSGPRs[1] == UsedSGPRs[2]) + SGPRReg = UsedSGPRs[1]; + } + + return SGPRReg; +} + +MachineInstrBuilder SIInstrInfo::buildIndirectWrite( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) + .addReg(IndirectBaseReg, RegState::Define) + .addOperand(I->getOperand(0)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0) + .addReg(ValueReg); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectRead( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + const DebugLoc &DL = MBB->findDebugLoc(I); + unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( + getIndirectIndexBegin(*MBB->getParent())); + + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addReg(IndirectBaseReg) + .addReg(OffsetReg) + .addImm(0); + +} + +void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const { + int End = getIndirectIndexEnd(MF); + int Begin = getIndirectIndexBegin(MF); + + if (End == -1) + return; + + + for (int Index = Begin; Index <= End; ++Index) + Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); + + for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) + Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); +} + +MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, + unsigned OperandName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); + if (Idx == -1) + return nullptr; + + return &MI.getOperand(Idx); +} + +uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { + uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; + if (ST.isAmdHsaOS()) + RsrcDataFormat |= (1ULL << 56); + + return RsrcDataFormat; +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h new file mode 100644 index 00000000000..64b5120841c --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -0,0 +1,391 @@ +//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIInstrInfo. 
+// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "SIDefines.h" +#include "SIRegisterInfo.h" + +namespace llvm { + +class SIInstrInfo : public AMDGPUInstrInfo { +private: + const SIRegisterInfo RI; + + unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + MachineOperand &SuperReg, + const TargetRegisterClass *SuperRC, + unsigned SubIdx, + const TargetRegisterClass *SubRC) const; + + unsigned split64BitImm(SmallVectorImpl &Worklist, + MachineBasicBlock::iterator MI, + MachineRegisterInfo &MRI, + const TargetRegisterClass *RC, + const MachineOperand &Op) const; + + void swapOperands(MachineBasicBlock::iterator Inst) const; + + void splitScalar64BitUnaryOp(SmallVectorImpl &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBinaryOp(SmallVectorImpl &Worklist, + MachineInstr *Inst, unsigned Opcode) const; + + void splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitBFE(SmallVectorImpl &Worklist, + MachineInstr *Inst) const; + + void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + + bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, + MachineInstr *MIb) const; + + unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + +public: + explicit SIInstrInfo(const AMDGPUSubtarget &st); + + const SIRegisterInfo &getRegisterInfo() const override { + return RI; + } + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const override; + + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const override; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, + unsigned &BaseReg, unsigned &Offset, + const TargetRegisterInfo *TRI) const final; + + bool shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + // \brief Returns an opcode that can be used to move a value to a \p DstRC + // register. If there is no hardware instruction that can store to \p + // DstRC, then AMDGPU::COPY is returned. 
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + unsigned commuteOpcode(const MachineInstr &MI) const; + + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI = false) const override; + bool findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + bool isTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA = nullptr) const; + + bool areMemAccessesTriviallyDisjoint( + MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) const override; + + MachineInstr *buildMovInstr(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned DstReg, unsigned SrcReg) const override; + bool isMov(unsigned Opcode) const override; + + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const final; + + unsigned getMachineCSELookAheadLimit() const override { return 500; } + + bool isSALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SALU; + } + + bool isVALU(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VALU; + } + + bool isSOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP1; + } + + bool isSOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOP2; + } + + bool isSOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPC; + } + + bool isSOPK(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPK; + } + + bool isSOPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SOPP; + } + + bool isVOP1(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP1; + } + + bool isVOP2(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP2; + } + + bool isVOP3(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOP3; + } + + bool isVOPC(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VOPC; + } + + bool isMUBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MUBUF; + } + + bool isMTBUF(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MTBUF; + } + + bool isSMRD(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::SMRD; + } + + bool isDS(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DS; + } + + bool isMIMG(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::MIMG; + } + + bool isFLAT(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FLAT; + } + + bool isWQM(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::WQM; + } + + bool isVGPRSpill(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; + } + + bool isInlineConstant(const APInt &Imm) const; + bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; + + bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + const MachineOperand &MO) const; + + /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// This function will return false if you pass it a 32-bit instruction. + bool hasVALU32BitEncoding(unsigned Opcode) const; + + /// \brief Returns true if this operand uses the constant bus. 
+ bool usesConstantBus(const MachineRegisterInfo &MRI, + const MachineOperand &MO, + unsigned OpSize) const; + + /// \brief Return true if this instruction has any modifiers. + /// e.g. src[012]_mod, omod, clamp. + bool hasModifiers(unsigned Opcode) const; + + bool hasModifiersSet(const MachineInstr &MI, + unsigned OpName) const; + + bool verifyInstruction(const MachineInstr *MI, + StringRef &ErrInfo) const override; + + static unsigned getVALUOp(const MachineInstr &MI); + + bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; + + /// \brief Return the correct register class for \p OpNo. For target-specific + /// instructions, this will return the register class that has been defined + /// in tablegen. For generic instructions, like REG_SEQUENCE it will return + /// the register class of its machine operand. + /// to infer the correct register class base on the other operands. + const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, + unsigned OpNo) const; + + /// \brief Return the size in bytes of the operand OpNo on the given + // instruction opcode. + unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { + const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; + + if (OpInfo.RegClass == -1) { + // If this is an immediate operand, this must be a 32-bit literal. + assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); + return 4; + } + + return RI.getRegClass(OpInfo.RegClass)->getSize(); + } + + /// \brief This form should usually be preferred since it handles operands + /// with unknown register classes. + unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { + return getOpRegClass(MI, OpNo)->getSize(); + } + + /// \returns true if it is legal for the operand at index \p OpNo + /// to read a VGPR. + bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; + + /// \brief Legalize the \p OpIndex operand of this instruction by inserting + /// a MOV. For example: + /// ADD_I32_e32 VGPR0, 15 + /// to + /// MOV VGPR1, 15 + /// ADD_I32_e32 VGPR0, VGPR1 + /// + /// If the operand being legalized is a register, then a COPY will be used + /// instead of MOV. + void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + + /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// for \p MI. + bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + const MachineOperand *MO = nullptr) const; + + /// \brief Legalize all operands in this instruction. This function may + /// create new instruction and insert them before \p MI. + void legalizeOperands(MachineInstr *MI) const; + + /// \brief Split an SMRD instruction into two smaller loads of half the + // size storing the results in \p Lo and \p Hi. + void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, + unsigned HalfImmOp, unsigned HalfSGPROp, + MachineInstr *&Lo, MachineInstr *&Hi) const; + + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + + /// \brief Replace this instruction's opcode with the equivalent VALU + /// opcode. This function will also move the users of \p MI to the + /// VALU if necessary. 
+ void moveToVALU(MachineInstr &MI) const; + + unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const override; + + const TargetRegisterClass *getIndirectAddrRegClass() const override; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const override; + void reserveIndirectRegisters(BitVector &Reserved, + const MachineFunction &MF) const; + + void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, + unsigned SavReg, unsigned IndexReg) const; + + void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + + /// \brief Returns the operand named \p Op. If \p MI does not have an + /// operand named \c Op, this function returns nullptr. + MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + + const MachineOperand *getNamedOperand(const MachineInstr &MI, + unsigned OpName) const { + return getNamedOperand(const_cast(MI), OpName); + } + + uint64_t getDefaultRsrcDataFormat() const; + +}; + +namespace AMDGPU { + + int getVOPe64(uint16_t Opcode); + int getVOPe32(uint16_t Opcode); + int getCommuteRev(uint16_t Opcode); + int getCommuteOrig(uint16_t Opcode); + int getAddr64Inst(uint16_t Opcode); + int getAtomicRetOp(uint16_t Opcode); + int getAtomicNoRetOp(uint16_t Opcode); + + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; + const uint64_t RSRC_TID_ENABLE = 1LL << 55; + +} // End namespace AMDGPU + +namespace SI { +namespace KernelInputOffsets { + +/// Offsets in bytes from the start of the input buffer +enum Offsets { + NGROUPS_X = 0, + NGROUPS_Y = 4, + NGROUPS_Z = 8, + GLOBAL_SIZE_X = 12, + GLOBAL_SIZE_Y = 16, + GLOBAL_SIZE_Z = 20, + LOCAL_SIZE_X = 24, + LOCAL_SIZE_Y = 28, + LOCAL_SIZE_Z = 32 +}; + +} // End namespace KernelInputOffsets +} // End namespace SI + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td new file mode 100644 index 00000000000..93e4ca74ec3 --- /dev/null +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -0,0 +1,2647 @@ +//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
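The AMDGPU namespace helpers declared at the end of SIInstrInfo.h above (getVOPe32, getVOPe64, getCommuteRev, getAddr64Inst, and so on) are thin wrappers around TableGen-generated instruction mapping tables; each returns -1 when an opcode has no counterpart. A small sketch of how a pass might use one of them (hypothetical helper, not code from this patch):

.. code-block:: cpp

   #include "SIInstrInfo.h"

   using namespace llvm;

   // Map a 32-bit encoded VALU opcode to its VOP3 (_e64) counterpart, keeping
   // the original opcode when no such mapping exists.
   static unsigned widenToVOP3(unsigned Opcode) {
     int E64 = AMDGPU::getVOPe64(Opcode);
     return E64 == -1 ? Opcode : static_cast<unsigned>(E64);
   }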
+// +//===----------------------------------------------------------------------===// +def isCI : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SEA_ISLANDS">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + +def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; + +class vop { + field bits<9> SI3; + field bits<10> VI3; +} + +class vopc si, bits<8> vi = !add(0x40, si)> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {0, si{7-0}}; + field bits<10> VI3 = {0, 0, vi{7-0}}; +} + +class vop1 si, bits<8> vi = si> : vop { + field bits<8> SI = si; + field bits<8> VI = vi; + + field bits<9> SI3 = {1, 1, si{6-0}}; + field bits<10> VI3 = !add(0x140, vi); +} + +class vop2 si, bits<6> vi = si> : vop { + field bits<6> SI = si; + field bits<6> VI = vi; + + field bits<9> SI3 = {1, 0, 0, si{5-0}}; + field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; +} + +// Specify a VOP2 opcode for SI and VOP3 opcode for VI +// that doesn't have VOP2 encoding on VI +class vop23 si, bits<10> vi> : vop2 { + let VI3 = vi; +} + +class vop3 si, bits<10> vi = {0, si}> : vop { + let SI3 = si; + let VI3 = vi; +} + +class sop1 si, bits<8> vi = si> { + field bits<8> SI = si; + field bits<8> VI = vi; +} + +class sop2 si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class sopk si, bits<5> vi = si> { + field bits<5> SI = si; + field bits<5> VI = vi; +} + +// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum +// in AMDGPUInstrInfo.cpp +def SISubtarget { + int NONE = -1; + int SI = 0; + int VI = 1; +} + +//===----------------------------------------------------------------------===// +// SI DAG Nodes +//===----------------------------------------------------------------------===// + +def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", + SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, + [SDNPMayLoad, SDNPMemOperand] +>; + +def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", + SDTypeProfile<0, 13, + [SDTCisVT<0, v4i32>, // rsrc(SGPR) + SDTCisVT<1, iAny>, // vdata(VGPR) + SDTCisVT<2, i32>, // num_channels(imm) + SDTCisVT<3, i32>, // vaddr(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // inst_offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // offen(imm) + SDTCisVT<9, i32>, // idxen(imm) + SDTCisVT<10, i32>, // glc(imm) + SDTCisVT<11, i32>, // slc(imm) + SDTCisVT<12, i32> // tfe(imm) + ]>, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, + SDTCisVT<3, i32>]> +>; + +class SDSample : SDNode , SDTCisVT<2, v32i8>, + SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> +>; + +def SIsample : SDSample<"AMDGPUISD::SAMPLE">; +def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; +def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; +def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; + +def SIconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> +>; + +//===----------------------------------------------------------------------===// +// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 +// to be glued to the memory instructions. 
+//===----------------------------------------------------------------------===// + +def SIld_local : SDNode <"ISD::LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ + return isLocalLoad(cast(N)); +}]>; + +def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast(N)->getAddressingMode() == ISD::UNINDEXED && + cast(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def si_load_local_align8 : Aligned8Bytes < + (ops node:$ptr), (si_load_local node:$ptr) +>; + +def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ + return cast(N)->getExtensionType() == ISD::SEXTLOAD; +}]>; +def si_az_extload_local : AZExtLoadBase ; + +multiclass SIExtLoadLocal { + + def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast(N)->getMemoryVT() == MVT::i8;}] + >; + + def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), + [{return cast(N)->getMemoryVT() == MVT::i16;}] + >; +} + +defm si_sextload_local : SIExtLoadLocal ; +defm si_az_extload_local : SIExtLoadLocal ; + +def SIst_local : SDNode <"ISD::STORE", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def si_st_local : PatFrag < + (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ + return isLocalStore(cast(N)); +}]>; + +def si_store_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast(N)->getAddressingMode() == ISD::UNINDEXED && + !cast(N)->isTruncatingStore(); +}]>; + +def si_store_local_align8 : Aligned8Bytes < + (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) +>; + +def si_truncstore_local : PatFrag < + (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ + return cast(N)->isTruncatingStore(); +}]>; + +def si_truncstore_local_i8 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def si_truncstore_local_i16 : PatFrag < + (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +multiclass SIAtomicM0Glue2 { + + def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] + >; + + def _local : local_binary_atomic_op (NAME#"_glue")>; +} + +defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; +defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; +defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; +defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; +defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; +defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; +defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; + +def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + +defm si_atomic_cmp_swap : AtomicCmpSwapLocal ; + +// Transformation function, extract the lower 32bit of a 64bit immediate +def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), + MVT::i32); +}]>; + +def LO32f : SDNodeXFormgetValueAPF().bitcastToAPInt().trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); +}]>; + +// Transformation 
function, extract the upper 32bit of a 64bit immediate +def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); +}]>; + +def HI32f : SDNodeXFormgetValueAPF().bitcastToAPInt().lshr(32).trunc(32); + return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), + MVT::f32); +}]>; + +def IMM8bitDWORD : PatLeaf <(imm), + [{return (N->getZExtValue() & ~0x3FC) == 0;}] +>; + +def as_dword_i32imm : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); +}]>; + +def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); +}]>; + +def as_i8imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); +}]>; + +def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); +}]>; + +def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); +}]>; + +def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i32 : SDNodeXFormgetTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); +}]>; + +// Copied from the AArch64 backend: +def bitcast_fpimm_to_i64 : SDNodeXFormgetTargetConstant( + N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); +}]>; + +def IMM8bit : PatLeaf <(imm), + [{return isUInt<8>(N->getZExtValue());}] +>; + +def IMM12bit : PatLeaf <(imm), + [{return isUInt<12>(N->getZExtValue());}] +>; + +def IMM16bit : PatLeaf <(imm), + [{return isUInt<16>(N->getZExtValue());}] +>; + +def IMM20bit : PatLeaf <(imm), + [{return isUInt<20>(N->getZExtValue());}] +>; + +def IMM32bit : PatLeaf <(imm), + [{return isUInt<32>(N->getZExtValue());}] +>; + +def mubuf_vaddr_offset : PatFrag< + (ops node:$ptr, node:$offset, node:$imm_offset), + (add (add node:$ptr, node:$offset), node:$imm_offset) +>; + +class InlineImm : PatLeaf <(vt imm), [{ + return isInlineImmediate(N); +}]>; + +class InlineFPImm : PatLeaf <(vt fpimm), [{ + return isInlineImmediate(N); +}]>; + +class SGPRImm : PatLeafgetGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return false; + } + const SIRegisterInfo *SIRI = + static_cast(Subtarget->getRegisterInfo()); + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + U != E; ++U) { + if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + return true; + } + } + return false; +}]>; + +//===----------------------------------------------------------------------===// +// Custom Operands +//===----------------------------------------------------------------------===// + +def FRAMEri32 : Operand { + let MIOperandInfo = (ops i32:$ptr, i32imm:$index); +} + +def SoppBrTarget : AsmOperandClass { + let Name = "SoppBrTarget"; + let ParserMethod = "parseSOppBrTarget"; +} + +def sopp_brtarget : Operand { + let EncoderMethod = "getSOPPBrEncoding"; + let OperandType = "OPERAND_PCREL"; + let ParserMatchClass = SoppBrTarget; +} + +include "SIInstrFormats.td" +include "VIInstrFormats.td" + +def MubufOffsetMatchClass : AsmOperandClass { + let Name = "MubufOffset"; + let ParserMethod = "parseMubufOptionalOps"; + let RenderMethod = "addImmOperands"; +} + +class DSOffsetBaseMatchClass : AsmOperandClass { + let Name = "DSOffset"#parser; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset"; +} + +def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; +def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; + +def 
DSOffset01MatchClass : AsmOperandClass { + let Name = "DSOffset1"; + let ParserMethod = "parseDSOff01OptionalOps"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isDSOffset01"; +} + +class GDSBaseMatchClass : AsmOperandClass { + let Name = "GDS"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; +def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; + +class GLCBaseMatchClass : AsmOperandClass { + let Name = "GLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; +def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; + +class SLCBaseMatchClass : AsmOperandClass { + let Name = "SLC"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; +def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; +def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; + +class TFEBaseMatchClass : AsmOperandClass { + let Name = "TFE"#parser; + let PredicateMethod = "isImm"; + let ParserMethod = parser; + let RenderMethod = "addImmOperands"; +} + +def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; +def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; +def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; + +def OModMatchClass : AsmOperandClass { + let Name = "OMod"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +def ClampMatchClass : AsmOperandClass { + let Name = "Clamp"; + let PredicateMethod = "isImm"; + let ParserMethod = "parseVOP3OptionalOps"; + let RenderMethod = "addImmOperands"; +} + +let OperandType = "OPERAND_IMMEDIATE" in { + +def offen : Operand { + let PrintMethod = "printOffen"; +} +def idxen : Operand { + let PrintMethod = "printIdxen"; +} +def addr64 : Operand { + let PrintMethod = "printAddr64"; +} +def mbuf_offset : Operand { + let PrintMethod = "printMBUFOffset"; + let ParserMatchClass = MubufOffsetMatchClass; +} +class ds_offset_base : Operand { + let PrintMethod = "printDSOffset"; + let ParserMatchClass = mc; +} +def ds_offset : ds_offset_base ; +def ds_offset_gds : ds_offset_base ; + +def ds_offset0 : Operand { + let PrintMethod = "printDSOffset0"; + let ParserMatchClass = DSOffset01MatchClass; +} +def ds_offset1 : Operand { + let PrintMethod = "printDSOffset1"; + let ParserMatchClass = DSOffset01MatchClass; +} +class gds_base : Operand { + let PrintMethod = "printGDS"; + let ParserMatchClass = mc; +} +def gds : gds_base ; + +def gds01 : gds_base ; + +class glc_base : Operand { + let PrintMethod = "printGLC"; + let ParserMatchClass = mc; +} + +def glc : glc_base ; +def glc_flat : glc_base ; + +class slc_base : Operand { + let PrintMethod = "printSLC"; + let ParserMatchClass = mc; +} + +def slc : slc_base ; +def slc_flat : slc_base ; +def slc_flat_atomic : slc_base ; + +class tfe_base : Operand { + let PrintMethod = "printTFE"; + let ParserMatchClass = mc; +} + +def tfe : tfe_base ; +def tfe_flat : tfe_base ; +def tfe_flat_atomic : tfe_base ; + +def omod : Operand { + let PrintMethod = "printOModSI"; + let ParserMatchClass = OModMatchClass; +} + +def ClampMod : Operand { + let PrintMethod = 
"printClampSI"; + let ParserMatchClass = ClampMatchClass; +} + +} // End OperandType = "OPERAND_IMMEDIATE" + +def VOPDstS64 : VOPDstOperand ; + +//===----------------------------------------------------------------------===// +// Complex patterns +//===----------------------------------------------------------------------===// + +def DS1Addr1Offset : ComplexPattern; +def DS64Bit4ByteAligned : ComplexPattern; + +def MUBUFAddr32 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64Atomic : ComplexPattern; +def MUBUFScratch : ComplexPattern; +def MUBUFOffset : ComplexPattern; +def MUBUFOffsetAtomic : ComplexPattern; + +def VOP3Mods0 : ComplexPattern; +def VOP3Mods0Clamp : ComplexPattern; +def VOP3Mods0Clamp0OMod : ComplexPattern; +def VOP3Mods : ComplexPattern; + +//===----------------------------------------------------------------------===// +// SI assembler operands +//===----------------------------------------------------------------------===// + +def SIOperand { + int ZERO = 0x80; + int VCC = 0x6A; + int FLAT_SCR = 0x68; +} + +def SRCMODS { + int NONE = 0; + int NEG = 1; +} + +def DSTCLAMP { + int NONE = 0; +} + +def DSTOMOD { + int NONE = 0; +} + +//===----------------------------------------------------------------------===// +// +// SI Instruction multiclass helpers. +// +// Instructions with _32 take 32-bit operands. +// Instructions with _64 take 64-bit operands. +// +// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit +// encoding is the standard encoding, but instruction that make use of +// any of the instruction modifiers must use the 64-bit encoding. +// +// Instructions with _e32 use the 32-bit encoding. +// Instructions with _e64 use the 64-bit encoding. +// +//===----------------------------------------------------------------------===// + +class SIMCInstr { + string PseudoInstr = pseudo; + int Subtarget = subtarget; +} + +//===----------------------------------------------------------------------===// +// EXP classes +//===----------------------------------------------------------------------===// + +class EXPCommon : InstSI< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), + "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + let EXP_CNT = 1; + let Uses = [EXEC]; +} + +multiclass EXP_m { + + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + } + + def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + + def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; +} + +//===----------------------------------------------------------------------===// +// Scalar classes +//===----------------------------------------------------------------------===// + +class SOP1_Pseudo pattern> : + SOP1 , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOP1_Real_si : + SOP1 , + SOP1e , + SIMCInstr { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; +} + +class SOP1_Real_vi : + SOP1 , + SOP1e , + SIMCInstr { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; +} + +multiclass SOP1_m pattern> { + + def "" : SOP1_Pseudo ; + + def _si : SOP1_Real_si ; + + def _vi : SOP1_Real_vi ; + +} + +multiclass SOP1_32 pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; + +multiclass SOP1_64 pattern> : SOP1_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" 
$dst, $src0", pattern +>; + +// no input, 64-bit output. +multiclass SOP1_64_0 pattern> { + def "" : SOP1_Pseudo ; + + def _si : SOP1_Real_si { + let ssrc0 = 0; + } + + def _vi : SOP1_Real_vi { + let ssrc0 = 0; + } +} + +// 64-bit input, no output +multiclass SOP1_1 pattern> { + def "" : SOP1_Pseudo ; + + def _si : SOP1_Real_si { + let sdst = 0; + } + + def _vi : SOP1_Real_vi { + let sdst = 0; + } +} + +// 64-bit input, 32-bit output. +multiclass SOP1_32_64 pattern> : SOP1_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; + +class SOP2_Pseudo pattern> : + SOP2, + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 4; + + // Pseudo instructions have no encodings, but adding this field here allows + // us to do: + // let sdst = xxx in { + // for multiclasses that include both real and pseudo instructions. + field bits<7> sdst = 0; +} + +class SOP2_Real_si : + SOP2, + SOP2e, + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class SOP2_Real_vi : + SOP2, + SOP2e, + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass SOP2_SELECT_32 pattern> { + def "" : SOP2_Pseudo ; + + def _si : SOP2_Real_si ; + + def _vi : SOP2_Real_vi ; +} + +multiclass SOP2_m pattern> { + + def "" : SOP2_Pseudo ; + + def _si : SOP2_Real_si ; + + def _vi : SOP2_Real_vi ; + +} + +multiclass SOP2_32 pattern> : SOP2_m < + op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64 pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +multiclass SOP2_64_32 pattern> : SOP2_m < + op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; + +class SOPC_Helper op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC < + op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []>; + +class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper; + +class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> + : SOPC_Helper; + +class SOPK_Pseudo pattern> : + SOPK , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SOPK_Real_si : + SOPK , + SOPKe , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; +} + +class SOPK_Real_vi : + SOPK , + SOPKe , + SIMCInstr { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; +} + +multiclass SOPK_m { + def "" : SOPK_Pseudo ; + + def _si : SOPK_Real_si ; + + def _vi : SOPK_Real_vi ; + +} + +multiclass SOPK_32 pattern> { + def "" : SOPK_Pseudo ; + + def _si : SOPK_Real_si ; + + def _vi : SOPK_Real_vi ; +} + +multiclass SOPK_SCC pattern> { + def "" : SOPK_Pseudo ; + + let DisableEncoding = "$dst" in { + def _si : SOPK_Real_si ; + + def _vi : SOPK_Real_vi ; + } +} + +multiclass SOPK_32TIE pattern> : SOPK_m < + op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), + " $sdst, $simm16" +>; + +multiclass SOPK_IMM32 { + + def "" : SOPK_Pseudo ; + + def _si : SOPK , + SOPK64e , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + let isCodeGenOnly = 0; + } + + def _vi : SOPK , + SOPK64e , + SIMCInstr { + let AssemblerPredicates = [isVI]; + let isCodeGenOnly = 0; + } +} +//===----------------------------------------------------------------------===// +// SMRD classes +//===----------------------------------------------------------------------===// + +class SMRD_Pseudo pattern> : + SMRD , + 
SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class SMRD_Real_si op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD , + SMRDe , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class SMRD_Real_vi op, string opName, bit imm, dag outs, dag ins, + string asm> : + SMRD , + SMEMe_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass SMRD_m op, string opName, bit imm, dag outs, dag ins, + string asm, list pattern> { + + def "" : SMRD_Pseudo ; + + def _si : SMRD_Real_si ; + + // glc is only applicable to scalar stores, which are not yet + // implemented. + let glc = 0 in { + def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + } +} + +multiclass SMRD_Helper op, string opName, RegisterClass baseClass, + RegisterClass dstClass> { + defm _IMM : SMRD_m < + op, opName#"_IMM", 1, (outs dstClass:$dst), + (ins baseClass:$sbase, u32imm:$offset), + opName#" $dst, $sbase, $offset", [] + >; + + defm _SGPR : SMRD_m < + op, opName#"_SGPR", 0, (outs dstClass:$dst), + (ins baseClass:$sbase, SReg_32:$soff), + opName#" $dst, $sbase, $soff", [] + >; +} + +//===----------------------------------------------------------------------===// +// Vector ALU classes +//===----------------------------------------------------------------------===// + +// This must always be right before the operand being input modified. +def InputMods : OperandWithDefaultOps { + let PrintMethod = "printOperandAndMods"; +} + +def InputModsMatchClass : AsmOperandClass { + let Name = "RegWithInputMods"; +} + +def InputModsNoDefault : Operand { + let PrintMethod = "printOperandAndMods"; + let ParserMatchClass = InputModsMatchClass; +} + +class getNumSrcArgs { + int ret = + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 + 3)); // VOP3 +} + +// Returns the register class to use for the destination of VOP[123C] +// instructions for the given VT. +class getVALUDstForVT { + RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand, + !if(!eq(VT.Size, 64), VOPDstOperand, + VOPDstOperand)); // else VT == i1 +} + +// Returns the register class to use for source 0 of VOP[12C] +// instructions for the given VT. +class getVOPSrc0ForVT { + RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); +} + +// Returns the register class to use for source 1 of VOP[12C] for the +// given VT. +class getVOPSrc1ForVT { + RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); +} + +// Returns the register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3SrcForVT { + RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); +} + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +class hasModifiers { + bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, 0)); +} + +// Returns the input arguments for VOP[12C] instructions for the given SrcVT. +class getIns32 { + dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 + !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 + (ins))); +} + +// Returns the input arguments for VOP3 instructions for the given SrcVT. 
+class getIns64 { + + dag ret = + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP1 without modifiers + (ins Src0RC:$src0) + /* endif */ ), + !if (!eq(NumSrcArgs, 2), + !if (!eq(HasModifiers, 1), + // VOP 2 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP2 without modifiers + (ins Src0RC:$src0, Src1RC:$src1) + /* endif */ ) + /* NumSrcArgs == 3 */, + !if (!eq(HasModifiers, 1), + // VOP3 with modifiers + (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, + InputModsNoDefault:$src1_modifiers, Src1RC:$src1, + InputModsNoDefault:$src2_modifiers, Src2RC:$src2, + ClampMod:$clamp, omod:$omod) + /* else */, + // VOP3 without modifiers + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + /* endif */ ))); +} + +// Returns the assembly string for the inputs and outputs of a VOP[12C] +// instruction. This does not add the _e32 suffix, so it can be reused +// by getAsm64. +class getAsm32 { + string src1 = ", $src1"; + string src2 = ", $src2"; + string ret = "$dst, $src0"# + !if(!eq(NumSrcArgs, 1), "", src1)# + !if(!eq(NumSrcArgs, 3), src2, ""); +} + +// Returns the assembly string for the inputs and outputs of a VOP3 +// instruction. +class getAsm64 { + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string ret = + !if(!eq(HasModifiers, 0), + getAsm32.ret, + "$dst, "#src0#src1#src2#"$clamp"#"$omod"); +} + + +class VOPProfile _ArgVT> { + + field list ArgVT = _ArgVT; + + field ValueType DstVT = ArgVT[0]; + field ValueType Src0VT = ArgVT[1]; + field ValueType Src1VT = ArgVT[2]; + field ValueType Src2VT = ArgVT[3]; + field RegisterOperand DstRC = getVALUDstForVT.ret; + field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; + field RegisterClass Src1RC32 = getVOPSrc1ForVT.ret; + field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; + field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; + field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; + + field int NumSrcArgs = getNumSrcArgs.ret; + field bit HasModifiers = hasModifiers.ret; + + field dag Outs = (outs DstRC:$dst); + + field dag Ins32 = getIns32.ret; + field dag Ins64 = getIns64.ret; + + field string Asm32 = getAsm32.ret; + field string Asm64 = getAsm64.ret; +} + +// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order +// for the instruction patterns to work. 
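// Worked example (derived from the getIns32/getIns64/getAsm32/getAsm64 helpers
// above, not text from the patch): for the two-source float profile
// VOP_F32_F32_F32 defined below, NumSrcArgs is 2 and HasModifiers is 1, so
//   Ins32 = (ins VSrc_32:$src0, VGPR_32:$src1)
//   Asm32 = "$dst, $src0, $src1"
//   Ins64 = (ins InputModsNoDefault:$src0_modifiers, VCSrc_32:$src0,
//                InputModsNoDefault:$src1_modifiers, VCSrc_32:$src1,
//                ClampMod:$clamp, omod:$omod)
//   Asm64 = "$dst, $src0_modifiers, $src1_modifiers$clamp$omod"
// Only the 64-bit (VOP3) form carries the source modifier, clamp and omod
// operands, which is why instructions that use modifiers must select _e64.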
+def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; + +def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; + +def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; +def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; +def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; +def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; +def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; +def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; +def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; +def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; + +def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; +def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; +def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; +def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; +def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; +def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + let Src0RC32 = VCSrc_32; +} + +def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { + let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$dst, $src0_modifiers, $src1"; +} + +def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; +def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; +def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; +def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); + let Asm64 = "$dst, $src0, $src1, $src2"; +} + +def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; +def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); + field string Asm = "$dst, $src0, $vsrc1, $src2"; +} +def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; +def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; + + +class VOP { + string OpName = opName; +} + +class VOP2_REV { + string RevOp = revOp; + bit IsOrig = isOrig; +} + +class AtomicNoRet { + string NoRetOp = noRetOp; + bit IsRet = isRet; +} + +class VOP1_Pseudo pattern, string opName> : + VOP1Common , + VOP , + SIMCInstr , + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; + + field bits<8> vdst; + field bits<9> src0; +} + +class VOP1_Real_si : + VOP1, + SIMCInstr { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class VOP1_Real_vi : + VOP1, + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP1_m pattern, + string opName> { + def "" : VOP1_Pseudo ; + + def _si : VOP1_Real_si ; + + def _vi : VOP1_Real_vi ; +} + +multiclass VOP1SI_m pattern, + string opName> { + def "" : VOP1_Pseudo ; + + def _si : VOP1_Real_si ; +} + +class VOP2_Pseudo 
pattern, string opName> : + VOP2Common , + VOP , + SIMCInstr, + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP2_Real_si : + VOP2 , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class VOP2_Real_vi : + VOP2 , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP2SI_m pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo , + VOP2_REV; + + def _si : VOP2_Real_si ; +} + +multiclass VOP2_m pattern, + string opName, string revOp> { + def "" : VOP2_Pseudo , + VOP2_REV; + + def _si : VOP2_Real_si ; + + def _vi : VOP2_Real_vi ; + +} + +class VOP3DisableFields { + + bits<2> src0_modifiers = !if(HasModifiers, ?, 0); + bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); + bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); + bits<2> omod = !if(HasModifiers, ?, 0); + bits<1> clamp = !if(HasModifiers, ?, 0); + bits<9> src1 = !if(HasSrc1, ?, 0); + bits<9> src2 = !if(HasSrc2, ?, 0); +} + +class VOP3DisableModFields { + bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); + bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); + bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); + bits<2> omod = !if(HasOutputMods, ?, 0); + bits<1> clamp = !if(HasOutputMods, ?, 0); +} + +class VOP3_Pseudo pattern, string opName> : + VOP3Common , + VOP , + SIMCInstr, + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3e , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3e_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3be , + SIMCInstr { + let AssemblerPredicates = [isSICI]; +} + +class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName> : + VOP3Common , + VOP3be_vi , + SIMCInstr { + let AssemblerPredicates = [isVI]; +} + +multiclass VOP3_m pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si , + VOP3DisableFields; + def _vi : VOP3_Real_vi , + VOP3DisableFields; +} + +// VOP3_m without source modifiers +multiclass VOP3_m_nomods pattern, + string opName, int NumSrcArgs, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + let src0_modifiers = 0, + src1_modifiers = 0, + src2_modifiers = 0, + clamp = 0, + omod = 0 in { + def _si : VOP3_Real_si ; + def _vi : VOP3_Real_vi ; + } +} + +multiclass VOP3_1_m pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si , + VOP3DisableFields<0, 0, HasMods>; + + def _vi : VOP3_Real_vi , + VOP3DisableFields<0, 0, HasMods>; +} + +multiclass VOP3SI_1_m pattern, string opName, bit HasMods = 1> { + + def "" : VOP3_Pseudo ; + + def _si : VOP3_Real_si , + VOP3DisableFields<0, 0, HasMods>; + // No VI instruction. This class is for SI only. +} + +multiclass VOP3_2_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo , + VOP2_REV; + + def _si : VOP3_Real_si , + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3_Real_vi , + VOP3DisableFields<1, 0, HasMods>; +} + +multiclass VOP3SI_2_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + + def "" : VOP3_Pseudo , + VOP2_REV; + + def _si : VOP3_Real_si , + VOP3DisableFields<1, 0, HasMods>; + + // No VI instruction. This class is for SI only. 
+} + +// XXX - Is v_div_scale_{f32|f64} only available in vop3b without +// option of implicit vcc use? +multiclass VOP3b_2_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo , + VOP2_REV; + + // The VOP2 variant puts the carry out into VCC, the VOP3 variant + // can write it into any SGPR. We currently don't use the carry out, + // so for now hardcode it to VCC as well. + let sdst = SIOperand.VCC, Defs = [VCC] in { + def _si : VOP3b_Real_si , + VOP3DisableFields<1, 0, HasMods>; + + def _vi : VOP3b_Real_vi , + VOP3DisableFields<1, 0, HasMods>; + } // End sdst = SIOperand.VCC, Defs = [VCC] +} + +multiclass VOP3b_3_m pattern, string opName, string revOp, + bit HasMods = 1, bit UseFullOp = 0> { + def "" : VOP3_Pseudo ; + + + def _si : VOP3b_Real_si , + VOP3DisableFields<1, 1, HasMods>; + + def _vi : VOP3b_Real_vi , + VOP3DisableFields<1, 1, HasMods>; +} + +multiclass VOP3_C_m pattern, string opName, + bit HasMods, bit defExec, string revOp> { + + def "" : VOP3_Pseudo , + VOP2_REV; + + def _si : VOP3_Real_si , + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } + + def _vi : VOP3_Real_vi , + VOP3DisableFields<1, 0, HasMods> { + let Defs = !if(defExec, [EXEC], []); + } +} + +// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. +multiclass VOP2SI_3VI_m pattern = []> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : VOPAnyCommon , + SIMCInstr; + } + + def _si : VOP2 , + SIMCInstr { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP3Common , + VOP3e_vi , + VOP3DisableFields <1, 0, 0>, + SIMCInstr { + let AssemblerPredicates = [isVI]; + } +} + +multiclass VOP1_Helper pat32, + dag ins64, string asm64, list pat64, + bit HasMods> { + + defm _e32 : VOP1_m ; + + defm _e64 : VOP3_1_m ; +} + +multiclass VOP1Inst : VOP1_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + i32:$src0_modifiers, i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + P.HasModifiers +>; + +multiclass VOP1InstSI { + + defm _e32 : VOP1SI_m ; + + defm _e64 : VOP3SI_1_m ; +} + +multiclass VOP2_Helper pat32, + dag ins64, string asm64, list pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2_m ; + + defm _e64 : VOP3_2_m ; +} + +multiclass VOP2Inst : VOP2_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2InstSI { + defm _e32 : VOP2SI_m ; + + defm _e64 : VOP3SI_2_m ; +} + +multiclass VOP2b_Helper pat32, + dag ins64, string asm64, list pat64, + string revOp, bit HasMods> { + + defm _e32 : VOP2_m ; + + defm _e64 : VOP3b_2_m ; +} + +multiclass VOP2bInst : VOP2b_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +// A VOP2 instruction that is VOP3-only on VI. 
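// Hypothetical instantiation (the opcode values, the instruction name and the
// assumed <vop23 op, string opName, VOPProfile P> parameter order are made up
// for illustration): such an opcode is declared through the vop23 class from
// earlier in this file, which pairs its SI VOP2 encoding with its VI VOP3
// encoding:
//   defm V_FOO_B32 : VOP2_VI3_Inst <vop23<0x19, 0x199>, "v_foo_b32",
//                                   VOP_I32_I32_I32>;
// On SI this yields the usual _e32/_e64 pair; on VI only the VOP3 (_e64)
// encoding is produced by the helpers below.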
+multiclass VOP2_VI3_Helper pat32, + dag ins64, string asm64, list pat64, + string revOp, bit HasMods> { + defm _e32 : VOP2SI_m ; + + defm _e64 : VOP3_2_m ; +} + +multiclass VOP2_VI3_Inst + : VOP2_VI3_Helper < + op, opName, P.Outs, + P.Ins32, P.Asm32, [], + P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, P.HasModifiers +>; + +multiclass VOP2MADK pattern = []> { + + def "" : VOP2_Pseudo ; + +let isCodeGenOnly = 0 in { + def _si : VOP2Common , + SIMCInstr , + VOP2_MADKe { + let AssemblerPredicates = [isSICI]; + } + + def _vi : VOP2Common , + SIMCInstr , + VOP2_MADKe { + let AssemblerPredicates = [isVI]; + } +} // End isCodeGenOnly = 0 +} + +class VOPC_Pseudo pattern, string opName> : + VOPCCommon , + VOP , + SIMCInstr, + MnemonicAlias { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +multiclass VOPC_m pattern, + string opName, bit DefExec, string revOpName = ""> { + def "" : VOPC_Pseudo ; + + def _si : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; + } + + def _vi : VOPC, + SIMCInstr { + let Defs = !if(DefExec, [EXEC], []); + let hasSideEffects = DefExec; + } +} + +multiclass VOPC_Helper pat32, + dag out64, dag ins64, string asm64, list pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m ; + + defm _e64 : VOP3_C_m ; +} + +// Special case for class instructions which only have modifiers on +// the 1st source operand. +multiclass VOPC_Class_Helper pat32, + dag out64, dag ins64, string asm64, list pat64, + bit HasMods, bit DefExec, string revOp> { + defm _e32 : VOPC_m ; + + defm _e64 : VOP3_C_m , + VOP3DisableModFields<1, 0, 0>; +} + +multiclass VOPCInst : VOPC_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + P.HasModifiers, DefExec, revOp +>; + +multiclass VOPCClassInst : VOPC_Class_Helper < + op, opName, + P.Ins32, P.Asm32, [], + (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + !if(P.HasModifiers, + [(set i1:$dst, + (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], + [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + P.HasModifiers, DefExec, opName +>; + + +multiclass VOPC_F32 : + VOPCInst ; + +multiclass VOPC_F64 : + VOPCInst ; + +multiclass VOPC_I32 : + VOPCInst ; + +multiclass VOPC_I64 : + VOPCInst ; + + +multiclass VOPCX + : VOPCInst ; + +multiclass VOPCX_F32 : + VOPCX ; + +multiclass VOPCX_F64 : + VOPCX ; + +multiclass VOPCX_I32 : + VOPCX ; + +multiclass VOPCX_I64 : + VOPCX ; + +multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods +>; + +multiclass VOPC_CLASS_F32 : + VOPCClassInst ; + +multiclass VOPCX_CLASS_F32 : + VOPCClassInst ; + +multiclass VOPC_CLASS_F64 : + VOPCClassInst ; + +multiclass VOPCX_CLASS_F64 : + VOPCClassInst ; + +multiclass VOP3Inst : VOP3_Helper < + op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, + !if(!eq(P.NumSrcArgs, 3), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + P.Src2VT:$src2))]), + !if(!eq(P.NumSrcArgs, 2), + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + /* P.NumSrcArgs == 1 */, + !if(P.HasModifiers, + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod))))], + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers +>; + +// Special case for v_div_fmas_{f32|f64}, since it seems to be the +// only VOP instruction that implicitly reads VCC. +multiclass VOP3_VCC_Inst : VOP3_Helper < + op, opName, + (outs P.DstRC.RegClass:$dst), + (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, + InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, + InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, + ClampMod:$clamp, + omod:$omod), + " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$dst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), + (i1 VCC)))], + 3, 1 +>; + +multiclass VOP3b_Helper pattern> : + VOP3b_3_m < + op, (outs vrc:$vdst, SReg_64:$sdst), + (ins InputModsNoDefault:$src0_modifiers, arc:$src0, + InputModsNoDefault:$src1_modifiers, arc:$src1, + InputModsNoDefault:$src2_modifiers, arc:$src2, + ClampMod:$clamp, omod:$omod), + opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, + opName, opName, 1, 1 +>; + +multiclass VOP3b_64 pattern> : + VOP3b_Helper ; + +multiclass VOP3b_32 pattern> : + VOP3b_Helper ; + + +class Vop3ModPat : Pat< + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), + (Inst i32:$src0_modifiers, P.Src0VT:$src0, + i32:$src1_modifiers, P.Src1VT:$src1, + i32:$src2_modifiers, P.Src2VT:$src2, + i1:$clamp, + i32:$omod)>; + +//===----------------------------------------------------------------------===// +// Interpolation opcodes +//===----------------------------------------------------------------------===// + +class VINTRP_Pseudo pattern> : + VINTRPCommon , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class VINTRP_Real_si op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon , + VINTRPe , + SIMCInstr; + +class VINTRP_Real_vi op, string opName, dag outs, dag ins, + string asm> : + VINTRPCommon , + VINTRPe_vi , + SIMCInstr; + +multiclass VINTRP_m op, dag outs, dag ins, string asm, + list pattern = []> { + def "" : VINTRP_Pseudo ; + + def _si : VINTRP_Real_si ; + + def _vi : VINTRP_Real_vi ; +} + +//===----------------------------------------------------------------------===// +// Vector I/O classes +//===----------------------------------------------------------------------===// + +class DS_Pseudo pattern> : + DS , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class DS_Real_si op, string opName, dag outs, dag ins, string asm> : + 
DS , + DSe , + SIMCInstr { + let isCodeGenOnly = 0; +} + +class DS_Real_vi op, string opName, dag outs, dag ins, string asm> : + DS , + DSe_vi , + SIMCInstr ; + +class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm> : + DS_Real_si { + + // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; + let isCodeGenOnly = 0; +} + +class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm> : + DS_Real_vi { + + // Single load interpret the 2 i8imm operands as a single i16 offset. + bits<16> offset; + let offset0 = offset{7-0}; + let offset1 = offset{15-8}; +} + +multiclass DS_1A_RET op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr"#"$offset$gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, + gds01:$gds), + string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } +} + +multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + let data1 = 0, vdst = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { + + def "" : DS_Pseudo ; + + let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } +} + +multiclass DS_1A1D_RET op, string opName, RegisterClass rc, + string noRetOp = "", + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + let data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_1A2D_RET_m op, string opName, RegisterClass rc, + string noRetOp = "", dag ins, + dag outs = (outs rc:$vdst), + string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; +} + +multiclass DS_1A2D_RET op, string asm, RegisterClass rc, + string noRetOp = "", RegisterClass src = rc> : + DS_1A2D_RET_m ; + +multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, + string noRetOp = opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, + ds_offset:$offset, gds:$gds), + string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { + + def "" : DS_Pseudo , + AtomicNoRet; + + let vdst = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } +} + +multiclass DS_0A_RET op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins ds_offset:$offset, gds:$gds), + string asm 
= opName#" $vdst"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo ; + + let addr = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } // end addr = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} + +multiclass DS_1A_RET_GDS op, string opName, + dag outs = (outs VGPR_32:$vdst), + dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), + string asm = opName#" $vdst, $addr"#"$offset gds"> { + + def "" : DS_Pseudo ; + + let data0 = 0, data1 = 0, gds = 1 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } // end data0 = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A_GDS op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr), + string asm = opName#" $addr gds"> { + + def "" : DS_Pseudo ; + + let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { + def _si : DS_Real_si ; + def _vi : DS_Real_vi ; + } // end vdst = 0, data = 0, data1 = 0, gds = 1 +} + +multiclass DS_1A op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + string asm = opName#" $addr"#"$offset"#"$gds"> { + + let mayLoad = 1, mayStore = 1 in { + def "" : DS_Pseudo ; + + let vdst = 0, data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si ; + def _vi : DS_Off16_Real_vi ; + } // let vdst = 0, data0 = 0, data1 = 0 + } // end mayLoad = 1, mayStore = 1 +} + +//===----------------------------------------------------------------------===// +// MTBUF classes +//===----------------------------------------------------------------------===// + +class MTBUF_Pseudo pattern> : + MTBUF , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class MTBUF_Real_si op, string opName, dag outs, dag ins, + string asm> : + MTBUF , + MTBUFe , + SIMCInstr; + +class MTBUF_Real_vi op, string opName, dag outs, dag ins, string asm> : + MTBUF , + MTBUFe_vi , + SIMCInstr ; + +multiclass MTBUF_m op, string opName, dag outs, dag ins, string asm, + list pattern> { + + def "" : MTBUF_Pseudo ; + + def _si : MTBUF_Real_si ; + + def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; + +} + +let mayStore = 1, mayLoad = 0 in { + +multiclass MTBUF_Store_Helper op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs), + (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, + SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayStore = 1, mayLoad = 0 + +let mayLoad = 1, mayStore = 0 in { + +multiclass MTBUF_Load_Helper op, string opName, + RegisterClass regClass> : MTBUF_m < + op, opName, (outs regClass:$dst), + (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, + i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), + opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] +>; + +} // mayLoad = 1, mayStore = 0 + +//===----------------------------------------------------------------------===// +// MUBUF classes +//===----------------------------------------------------------------------===// + +class mubuf si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +let isCodeGenOnly = 0 in { + +class MUBUF_si op, dag outs, dag ins, string asm, list pattern> 
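// Note: two encoding details from the helpers above: the VI form of an MTBUF
// opcode is simply the 3-bit SI opcode zero-extended to four bits
// ({0, op{2}, op{1}, op{0}}), and the mubuf<si, vi> class pairs a 7-bit SI
// encoding with its VI encoding, with the VI value defaulting to the SI value
// when only one is given.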
: + MUBUF , MUBUFe { + let lds = 0; +} + +} // End let isCodeGenOnly = 0 + +class MUBUF_vi op, dag outs, dag ins, string asm, list pattern> : + MUBUF , MUBUFe_vi { + let lds = 0; +} + +class MUBUFAddr64Table { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + +class MUBUF_Pseudo pattern> : + MUBUF , + SIMCInstr { + let isPseudo = 1; + let isCodeGenOnly = 1; + + // dummy fields, so that we can use let statements around multiclasses + bits<1> offen; + bits<1> idxen; + bits<8> vaddr; + bits<1> glc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class MUBUF_Real_si : + MUBUF , + MUBUFe , + SIMCInstr { + let lds = 0; +} + +class MUBUF_Real_vi : + MUBUF , + MUBUFe_vi , + SIMCInstr { + let lds = 0; +} + +multiclass MUBUF_m pattern> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <0>; + + let addr64 = 0, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si ; + } + + def _vi : MUBUF_Real_vi ; +} + +multiclass MUBUFAddr64_m pattern> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <1>; + + let addr64 = 1, isCodeGenOnly = 0 in { + def _si : MUBUF_Real_si ; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. +} + +multiclass MUBUFAtomicOffset_m pattern, bit is_return> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, + AtomicNoRet; + + let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si ; + } + + def _vi : MUBUF_Real_vi ; + } +} + +multiclass MUBUFAtomicAddr64_m pattern, bit is_return> { + + def "" : MUBUF_Pseudo , + MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, + AtomicNoRet; + + let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { + def _si : MUBUF_Real_si ; + } + + // There is no VI version. If the pseudo is selected, it should be lowered + // for VI appropriately. 
+} + +multiclass MUBUF_Atomic { + + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + + // No return variants + let glc = 0 in { + + defm _ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_addr64", (outs), + (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 + >; + + defm _OFFSET : MUBUFAtomicOffset_m < + op, name#"_offset", (outs), + (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, + slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + >; + } // glc = 0 + + // Variant that return values + let glc = 1, Constraints = "$vdata = $vdata_in", + DisableEncoding = "$vdata_in" in { + + defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < + op, name#"_rtn_addr64", (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", + [(set vt:$vdata, + (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$vdata_in))], 1 + >; + + defm _RTN_OFFSET : MUBUFAtomicOffset_m < + op, name#"_rtn_offset", (outs rc:$vdata), + (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, + mbuf_offset:$offset, slc:$slc), + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + [(set vt:$vdata, + (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, + i1:$slc), vt:$vdata_in))], 1 + >; + + } // glc = 1 + + } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 +} + +multiclass MUBUF_Load_Helper { + + let mayLoad = 1, mayStore = 0 in { + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m ; + } + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m ; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m ; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m ; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m ; + } + } +} + +multiclass MUBUF_Store_Helper { + let mayLoad = 0, mayStore = 1 in { + defm : MUBUF_m ; + + let offen = 0, idxen = 0, vaddr = 0 in { + defm _OFFSET : MUBUF_m ; + } // offen = 0, idxen = 0, vaddr = 0 + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUF_m ; + } // end offen = 1, idxen = 0 + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUF_m ; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUF_m ; + } + + let offen = 0, idxen = 0 in { + defm _ADDR64 : MUBUFAddr64_m ; + } + } // End mayLoad = 0, mayStore = 1 +} + +class FLAT_Load_Helper op, string asm, RegisterClass regClass> : + FLAT { + let data = 0; + let mayLoad = 1; +} + +class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : + FLAT { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let vdst = 0; +} + +multiclass FLAT_ATOMIC op, string name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc> { + + let mayLoad = 1, mayStore = 1 in { + def "" : FLAT , + AtomicNoRet { + let glc = 0; + let vdst = 0; + } + + def _RTN : FLAT , + AtomicNoRet { + let glc = 1; + } + } +} + +class MIMG_Mask { + string Op = op; + int Channels = channels; +} + +class MIMG_NoSampler_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc", + []> { + let 
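// Note: for both the MUBUF and FLAT atomics above, the glc bit selects the
// flavour: glc = 0 gives the no-return form, glc = 1 the returning form whose
// destination is tied to the data input (the "$vdata = $vdata_in" constraint
// for MUBUF, the _RTN defs for FLAT).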
ssamp = 0; + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; +} + +multiclass MIMG_NoSampler_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels> { + def _V1 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V2 : MIMG_NoSampler_Helper , + MIMG_Mask; + def _V4 : MIMG_NoSampler_Helper , + MIMG_Mask; +} + +multiclass MIMG_NoSampler op, string asm> { + defm _V1 : MIMG_NoSampler_Src_Helper ; + defm _V2 : MIMG_NoSampler_Src_Helper ; + defm _V3 : MIMG_NoSampler_Src_Helper ; + defm _V4 : MIMG_NoSampler_Src_Helper ; +} + +class MIMG_Sampler_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let WQM = wqm; +} + +multiclass MIMG_Sampler_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V2 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V4 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V8 : MIMG_Sampler_Helper , + MIMG_Mask; + def _V16 : MIMG_Sampler_Helper , + MIMG_Mask; +} + +multiclass MIMG_Sampler op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper; + defm _V2 : MIMG_Sampler_Src_Helper; + defm _V3 : MIMG_Sampler_Src_Helper; + defm _V4 : MIMG_Sampler_Src_Helper; +} + +multiclass MIMG_Sampler_WQM op, string asm> { + defm _V1 : MIMG_Sampler_Src_Helper; + defm _V2 : MIMG_Sampler_Src_Helper; + defm _V3 : MIMG_Sampler_Src_Helper; + defm _V4 : MIMG_Sampler_Src_Helper; +} + +class MIMG_Gather_Helper op, string asm, + RegisterClass dst_rc, + RegisterClass src_rc, int wqm> : MIMG < + op, + (outs dst_rc:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + SReg_256:$srsrc, SReg_128:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", + []> { + let mayLoad = 1; + let mayStore = 0; + + // DMASK was repurposed for GATHER4. 4 components are always + // returned and DMASK works like a swizzle - it selects + // the component to fetch. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. 
+ // Therefore, disable all code which updates DMASK by setting these two: + let MIMG = 0; + let hasPostISelHook = 0; + let WQM = wqm; +} + +multiclass MIMG_Gather_Src_Helper op, string asm, + RegisterClass dst_rc, + int channels, int wqm> { + def _V1 : MIMG_Gather_Helper , + MIMG_Mask; + def _V2 : MIMG_Gather_Helper , + MIMG_Mask; + def _V4 : MIMG_Gather_Helper , + MIMG_Mask; + def _V8 : MIMG_Gather_Helper , + MIMG_Mask; + def _V16 : MIMG_Gather_Helper , + MIMG_Mask; +} + +multiclass MIMG_Gather op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper; + defm _V2 : MIMG_Gather_Src_Helper; + defm _V3 : MIMG_Gather_Src_Helper; + defm _V4 : MIMG_Gather_Src_Helper; +} + +multiclass MIMG_Gather_WQM op, string asm> { + defm _V1 : MIMG_Gather_Src_Helper; + defm _V2 : MIMG_Gather_Src_Helper; + defm _V3 : MIMG_Gather_Src_Helper; + defm _V4 : MIMG_Gather_Src_Helper; +} + +//===----------------------------------------------------------------------===// +// Vector instruction mappings +//===----------------------------------------------------------------------===// + +// Maps an opcode in e32 form to its e64 equivalent +def getVOPe64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; +} + +// Maps an opcode in e64 form to its e32 equivalent +def getVOPe32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["4"]]; +} + +def getMaskedMIMGOp : InstrMapping { + let FilterClass = "MIMG_Mask"; + let RowFields = ["Op"]; + let ColFields = ["Channels"]; + let KeyCol = ["4"]; + let ValueCols = [["1"], ["2"], ["3"] ]; +} + +// Maps an commuted opcode to its original version +def getCommuteOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its commuted version +def getCommuteRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +def getCommuteCmpOrig : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an original opcode to its commuted version +def getCommuteCmpRev : InstrMapping { + let FilterClass = "VOP2_REV"; + let RowFields = ["RevOp"]; + let ColFields = ["IsOrig"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + + +def getMCOpcodeGen : InstrMapping { + let FilterClass = "SIMCInstr"; + let RowFields = ["PseudoInstr"]; + let ColFields = ["Subtarget"]; + let KeyCol = [!cast(SISubtarget.NONE)]; + let ValueCols = [[!cast(SISubtarget.SI)],[!cast(SISubtarget.VI)]]; +} + +def getAddr64Inst : InstrMapping { + let FilterClass = "MUBUFAddr64Table"; + let RowFields = ["OpName"]; + let ColFields = ["IsAddr64"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its version with a return value. +def getAtomicRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Maps an atomic opcode to its returnless version. 
+def getAtomicNoRetOp : InstrMapping { + let FilterClass = "AtomicNoRet"; + let RowFields = ["NoRetOp"]; + let ColFields = ["IsRet"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +include "SIInstructions.td" +include "CIInstructions.td" +include "VIInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td new file mode 100644 index 00000000000..8c8d836776d --- /dev/null +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -0,0 +1,3327 @@ +//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file was originally auto-generated from a GPU register header file and +// all the instruction definitions were originally commented out. Instructions +// that are not yet supported remain commented out. +//===----------------------------------------------------------------------===// + +class InterpSlots { +int P0 = 2; +int P10 = 0; +int P20 = 1; +} +def INTERP : InterpSlots; + +def InterpSlot : Operand { + let PrintMethod = "printInterpSlot"; +} + +def SendMsgImm : Operand { + let PrintMethod = "printSendMsg"; +} + +def isGCN : Predicate<"Subtarget->getGeneration() " + ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureGCN">; +def isSI : Predicate<"Subtarget->getGeneration() " + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; + +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; + let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; +} + +def WAIT_FLAG : InstFlag<"printWaitFlag"> { + let ParserMatchClass = SWaitMatchClass; +} + +let SubtargetPredicate = isGCN in { + +//===----------------------------------------------------------------------===// +// EXP Instructions +//===----------------------------------------------------------------------===// + +defm EXP : EXP_m; + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +let mayLoad = 1 in { + +// We are using the SGPR_32 and not the SReg_32 register class for 32-bit +// SMRD instructions, because the SGPR_32 register class does not include M0 +// and writing to M0 from an SMRD instruction will hang the GPU. 
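The AtomicNoRet records used throughout these definitions key each returning atomic to the name of its no-return counterpart, and getAtomicRetOp / getAtomicNoRetOp above expose that relation in both directions, presumably so the compiler can swap forms (for instance when an atomic's result goes unused). A rough C++ stand-in for one direction of the lookup; the map and function are illustrative only, with mnemonics taken from the DS definitions below:

.. code-block:: c++

   #include <iostream>
   #include <map>
   #include <string>

   // Toy model of the AtomicNoRet relation: returning form -> no-return form.
   std::string getNoRetForm(const std::string &RetOp) {
     static const std::map<std::string, std::string> RetToNoRet = {
         {"ds_add_rtn_u32", "ds_add_u32"},
         {"ds_sub_rtn_u32", "ds_sub_u32"},
     };
     auto It = RetToNoRet.find(RetOp);
     return It == RetToNoRet.end() ? RetOp : It->second;
   }

   int main() {
     std::cout << getNoRetForm("ds_add_rtn_u32") << "\n"; // ds_add_u32
     return 0;
   }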
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; + +defm S_BUFFER_LOAD_DWORD : SMRD_Helper < + 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 +>; + +defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < + 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 +>; + +defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < + 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 +>; + +defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < + 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 +>; + +defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < + 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 +>; + +} // mayLoad = 1 + +//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +//===----------------------------------------------------------------------===// +// SOP1 Instructions +//===----------------------------------------------------------------------===// + +let isMoveImm = 1 in { + let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; + defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; + } // let isRematerializeable = 1 + + let Uses = [SCC] in { + defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; + defm S_CMOV_B64 : SOP1_64 , "s_cmov_b64", []>; + } // End Uses = [SCC] +} // End isMoveImm = 1 + +let Defs = [SCC] in { + defm S_NOT_B32 : SOP1_32 , "s_not_b32", + [(set i32:$dst, (not i32:$src0))] + >; + + defm S_NOT_B64 : SOP1_64 , "s_not_b64", + [(set i64:$dst, (not i64:$src0))] + >; + defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; + defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; +} // End Defs = [SCC] + + +defm S_BREV_B32 : SOP1_32 , "s_brev_b32", + [(set i32:$dst, (AMDGPUbrev i32:$src0))] +>; +defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; + +let Defs = [SCC] in { + defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; + defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; + defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", + [(set i32:$dst, (ctpop i32:$src0))] + >; + defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; +} // End Defs = [SCC] + +defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; +defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; +defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", + [(set i32:$dst, (cttz_zero_undef i32:$src0))] +>; +defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; + +defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", + [(set i32:$dst, (ctlz_zero_undef i32:$src0))] +>; + +defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; +defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", + [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] +>; +defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; +defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", + [(set i32:$dst, (sext_inreg i32:$src0, i8))] +>; +defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", + [(set i32:$dst, (sext_inreg i32:$src0, i16))] +>; + +defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; +defm S_BITSET0_B64 : SOP1_64 , "s_bitset0_b64", []>; +defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; +defm S_BITSET1_B64 : SOP1_64 , "s_bitset1_b64", []>; +defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; +defm S_SETPC_B64 : SOP1_64 , "s_setpc_b64", []>; +defm S_SWAPPC_B64 : SOP1_64 , 
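// Note: the SOP1 patterns above map generic DAG nodes straight onto scalar
// ALU ops: s_brev_b32 <- AMDGPUbrev, s_bcnt1_i32_b32 <- ctpop,
// s_ff1_i32_b32 <- cttz_zero_undef, s_flbit_i32_b32 <- ctlz_zero_undef, and
// s_sext_i32_i8/_i16 <- sext_inreg from 8/16 bits. The SOP2 block that
// follows chains 64-bit integer adds through SCC the same way: s_add_u32
// produces the carry ("carry out goes to SCC") and s_addc_u32 consumes it
// ("carry in comes from SCC").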
"s_swappc_b64", []>; +defm S_RFE_B64 : SOP1_64 , "s_rfe_b64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { + +defm S_AND_SAVEEXEC_B64 : SOP1_64 , "s_and_saveexec_b64", []>; +defm S_OR_SAVEEXEC_B64 : SOP1_64 , "s_or_saveexec_b64", []>; +defm S_XOR_SAVEEXEC_B64 : SOP1_64 , "s_xor_saveexec_b64", []>; +defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 , "s_andn2_saveexec_b64", []>; +defm S_ORN2_SAVEEXEC_B64 : SOP1_64 , "s_orn2_saveexec_b64", []>; +defm S_NAND_SAVEEXEC_B64 : SOP1_64 , "s_nand_saveexec_b64", []>; +defm S_NOR_SAVEEXEC_B64 : SOP1_64 , "s_nor_saveexec_b64", []>; +defm S_XNOR_SAVEEXEC_B64 : SOP1_64 , "s_xnor_saveexec_b64", []>; + +} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] + +defm S_QUADMASK_B32 : SOP1_32 , "s_quadmask_b32", []>; +defm S_QUADMASK_B64 : SOP1_64 , "s_quadmask_b64", []>; +defm S_MOVRELS_B32 : SOP1_32 , "s_movrels_b32", []>; +defm S_MOVRELS_B64 : SOP1_64 , "s_movrels_b64", []>; +defm S_MOVRELD_B32 : SOP1_32 , "s_movreld_b32", []>; +defm S_MOVRELD_B64 : SOP1_64 , "s_movreld_b64", []>; +defm S_CBRANCH_JOIN : SOP1_1 , "s_cbranch_join", []>; +defm S_MOV_REGRD_B32 : SOP1_32 , "s_mov_regrd_b32", []>; +let Defs = [SCC] in { + defm S_ABS_I32 : SOP1_32 , "s_abs_i32", []>; +} // End Defs = [SCC] +defm S_MOV_FED_B32 : SOP1_32 , "s_mov_fed_b32", []>; + +//===----------------------------------------------------------------------===// +// SOP2 Instructions +//===----------------------------------------------------------------------===// + +let Defs = [SCC] in { // Carry out goes to SCC +let isCommutable = 1 in { +defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; +defm S_ADD_I32 : SOP2_32 , "s_add_i32", + [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] +>; +} // End isCommutable = 1 + +defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; +defm S_SUB_I32 : SOP2_32 , "s_sub_i32", + [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] +>; + +let Uses = [SCC] in { // Carry in comes from SCC +let isCommutable = 1 in { +defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", + [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End isCommutable = 1 + +defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", + [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; +} // End Uses = [SCC] + +defm S_MIN_I32 : SOP2_32 , "s_min_i32", + [(set i32:$dst, (smin i32:$src0, i32:$src1))] +>; +defm S_MIN_U32 : SOP2_32 , "s_min_u32", + [(set i32:$dst, (umin i32:$src0, i32:$src1))] +>; +defm S_MAX_I32 : SOP2_32 , "s_max_i32", + [(set i32:$dst, (smax i32:$src0, i32:$src1))] +>; +defm S_MAX_U32 : SOP2_32 , "s_max_u32", + [(set i32:$dst, (umax i32:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + + +let Uses = [SCC] in { + defm S_CSELECT_B32 : SOP2_32 , "s_cselect_b32", []>; + defm S_CSELECT_B64 : SOP2_64 , "s_cselect_b64", []>; +} // End Uses = [SCC] + +let Defs = [SCC] in { +defm S_AND_B32 : SOP2_32 , "s_and_b32", + [(set i32:$dst, (and i32:$src0, i32:$src1))] +>; + +defm S_AND_B64 : SOP2_64 , "s_and_b64", + [(set i64:$dst, (and i64:$src0, i64:$src1))] +>; + +defm S_OR_B32 : SOP2_32 , "s_or_b32", + [(set i32:$dst, (or i32:$src0, i32:$src1))] +>; + +defm S_OR_B64 : SOP2_64 , "s_or_b64", + [(set i64:$dst, (or i64:$src0, i64:$src1))] +>; + +defm S_XOR_B32 : SOP2_32 , "s_xor_b32", + [(set i32:$dst, (xor i32:$src0, i32:$src1))] +>; + +defm S_XOR_B64 : SOP2_64 , "s_xor_b64", + [(set i64:$dst, (xor i64:$src0, i64:$src1))] +>; +defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; +defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; +defm S_ORN2_B32 : SOP2_32 , "s_orn2_b32", []>; +defm 
S_ORN2_B64 : SOP2_64 , "s_orn2_b64", []>; +defm S_NAND_B32 : SOP2_32 , "s_nand_b32", []>; +defm S_NAND_B64 : SOP2_64 , "s_nand_b64", []>; +defm S_NOR_B32 : SOP2_32 , "s_nor_b32", []>; +defm S_NOR_B64 : SOP2_64 , "s_nor_b64", []>; +defm S_XNOR_B32 : SOP2_32 , "s_xnor_b32", []>; +defm S_XNOR_B64 : SOP2_64 , "s_xnor_b64", []>; +} // End Defs = [SCC] + +// Use added complexity so these patterns are preferred to the VALU patterns. +let AddedComplexity = 1 in { +let Defs = [SCC] in { + +defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", + [(set i32:$dst, (shl i32:$src0, i32:$src1))] +>; +defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", + [(set i64:$dst, (shl i64:$src0, i32:$src1))] +>; +defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", + [(set i32:$dst, (srl i32:$src0, i32:$src1))] +>; +defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", + [(set i64:$dst, (srl i64:$src0, i32:$src1))] +>; +defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", + [(set i32:$dst, (sra i32:$src0, i32:$src1))] +>; +defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", + [(set i64:$dst, (sra i64:$src0, i32:$src1))] +>; +} // End Defs = [SCC] + +defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", + [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +defm S_BFM_B64 : SOP2_64 , "s_bfm_b64", []>; +defm S_MUL_I32 : SOP2_32 , "s_mul_i32", + [(set i32:$dst, (mul i32:$src0, i32:$src1))] +>; + +} // End AddedComplexity = 1 + +let Defs = [SCC] in { +defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; +defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; +defm S_BFE_U64 : SOP2_64 , "s_bfe_u64", []>; +defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; +} // End Defs = [SCC] + +let sdst = 0 in { +defm S_CBRANCH_G_FORK : SOP2_m < + sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), + (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] +>; +} + +let Defs = [SCC] in { +defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; +} // End Defs = [SCC] + +//===----------------------------------------------------------------------===// +// SOPC Instructions +//===----------------------------------------------------------------------===// + +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; + +//===----------------------------------------------------------------------===// +// SOPK Instructions +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1 in { +defm S_MOVK_I32 : SOPK_32 , "s_movk_i32", []>; +} // End isReMaterializable = 1 +let Uses = [SCC] in { + defm S_CMOVK_I32 : SOPK_32 , "s_cmovk_i32", []>; +} + +let isCompare = 1 in { + +/* +This instruction is disabled for 
now until we can figure out how to teach +the instruction selector to correctly use the S_CMP* vs V_CMP* +instructions. + +When this instruction is enabled the code generator sometimes produces this +invalid sequence: + +SCC = S_CMPK_EQ_I32 SGPR0, imm +VCC = COPY SCC +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 + +defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", + [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] +>; +*/ + +defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", []>; +defm S_CMPK_LG_I32 : SOPK_SCC , "s_cmpk_lg_i32", []>; +defm S_CMPK_GT_I32 : SOPK_SCC , "s_cmpk_gt_i32", []>; +defm S_CMPK_GE_I32 : SOPK_SCC , "s_cmpk_ge_i32", []>; +defm S_CMPK_LT_I32 : SOPK_SCC , "s_cmpk_lt_i32", []>; +defm S_CMPK_LE_I32 : SOPK_SCC , "s_cmpk_le_i32", []>; +defm S_CMPK_EQ_U32 : SOPK_SCC , "s_cmpk_eq_u32", []>; +defm S_CMPK_LG_U32 : SOPK_SCC , "s_cmpk_lg_u32", []>; +defm S_CMPK_GT_U32 : SOPK_SCC , "s_cmpk_gt_u32", []>; +defm S_CMPK_GE_U32 : SOPK_SCC , "s_cmpk_ge_u32", []>; +defm S_CMPK_LT_U32 : SOPK_SCC , "s_cmpk_lt_u32", []>; +defm S_CMPK_LE_U32 : SOPK_SCC , "s_cmpk_le_u32", []>; +} // End isCompare = 1 + +let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", + Constraints = "$sdst = $src0" in { + defm S_ADDK_I32 : SOPK_32TIE , "s_addk_i32", []>; + defm S_MULK_I32 : SOPK_32TIE , "s_mulk_i32", []>; +} + +defm S_CBRANCH_I_FORK : SOPK_m < + sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), + (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +defm S_GETREG_B32 : SOPK_32 , "s_getreg_b32", []>; +defm S_SETREG_B32 : SOPK_m < + sopk<0x13, 0x12>, "s_setreg_b32", (outs), + (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" +>; +// FIXME: Not on SI? +//defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; +defm S_SETREG_IMM32_B32 : SOPK_IMM32 < + sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), + (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" +>; + +//===----------------------------------------------------------------------===// +// SOPP Instructions +//===----------------------------------------------------------------------===// + +def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", + [(IL_retflag)]> { + let simm16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", + [(br bb:$simm16)]> { + let isBarrier = 1; +} + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc0 $simm16" +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + "s_cbranch_scc1 $simm16" +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccz $simm16" +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + "s_cbranch_vccnz $simm16" +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execz $simm16" +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + "s_cbranch_execnz $simm16" +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +let hasSideEffects = 1 in { +def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", + [(int_AMDGPU_barrier_local)] +> { + let simm16 = 0; + let isBarrier = 1; + let 
hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + +def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; +def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; +def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; +def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; + +let Uses = [EXEC, M0] in { + def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", + [(AMDGPUsendmsg (i32 imm:$simm16))] + >; +} // End Uses = [EXEC, M0] + +def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; +def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { + let simm16 = 0; +} +def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; +def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; +def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { + let simm16 = 0; +} +} // End hasSideEffects + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + +let isCompare = 1, isCommutable = 1 in { + +defm V_CMP_F_F32 : VOPC_F32 , "v_cmp_f_f32">; +defm V_CMP_LT_F32 : VOPC_F32 , "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; +defm V_CMP_EQ_F32 : VOPC_F32 , "v_cmp_eq_f32", COND_OEQ>; +defm V_CMP_LE_F32 : VOPC_F32 , "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; +defm V_CMP_GT_F32 : VOPC_F32 , "v_cmp_gt_f32", COND_OGT>; +defm V_CMP_LG_F32 : VOPC_F32 , "v_cmp_lg_f32", COND_ONE>; +defm V_CMP_GE_F32 : VOPC_F32 , "v_cmp_ge_f32", COND_OGE>; +defm V_CMP_O_F32 : VOPC_F32 , "v_cmp_o_f32", COND_O>; +defm V_CMP_U_F32 : VOPC_F32 , "v_cmp_u_f32", COND_UO>; +defm V_CMP_NGE_F32 : VOPC_F32 , "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; +defm V_CMP_NLG_F32 : VOPC_F32 , "v_cmp_nlg_f32", COND_UEQ>; +defm V_CMP_NGT_F32 : VOPC_F32 , "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; +defm V_CMP_NLE_F32 : VOPC_F32 , "v_cmp_nle_f32", COND_UGT>; +defm V_CMP_NEQ_F32 : VOPC_F32 , "v_cmp_neq_f32", COND_UNE>; +defm V_CMP_NLT_F32 : VOPC_F32 , "v_cmp_nlt_f32", COND_UGE>; +defm V_CMP_TRU_F32 : VOPC_F32 , "v_cmp_tru_f32">; + + +defm V_CMPX_F_F32 : VOPCX_F32 , "v_cmpx_f_f32">; +defm V_CMPX_LT_F32 : VOPCX_F32 , "v_cmpx_lt_f32", "v_cmpx_gt_f32">; +defm V_CMPX_EQ_F32 : VOPCX_F32 , "v_cmpx_eq_f32">; +defm V_CMPX_LE_F32 : VOPCX_F32 , "v_cmpx_le_f32", "v_cmpx_ge_f32">; +defm V_CMPX_GT_F32 : VOPCX_F32 , "v_cmpx_gt_f32">; +defm V_CMPX_LG_F32 : VOPCX_F32 , "v_cmpx_lg_f32">; +defm V_CMPX_GE_F32 : VOPCX_F32 , "v_cmpx_ge_f32">; +defm V_CMPX_O_F32 : VOPCX_F32 , "v_cmpx_o_f32">; +defm V_CMPX_U_F32 : VOPCX_F32 , "v_cmpx_u_f32">; +defm V_CMPX_NGE_F32 : VOPCX_F32 , "v_cmpx_nge_f32">; +defm V_CMPX_NLG_F32 : VOPCX_F32 , "v_cmpx_nlg_f32">; +defm V_CMPX_NGT_F32 : VOPCX_F32 , "v_cmpx_ngt_f32">; +defm V_CMPX_NLE_F32 : VOPCX_F32 , "v_cmpx_nle_f32">; +defm V_CMPX_NEQ_F32 : VOPCX_F32 , "v_cmpx_neq_f32">; +defm V_CMPX_NLT_F32 : VOPCX_F32 , "v_cmpx_nlt_f32">; +defm V_CMPX_TRU_F32 : VOPCX_F32 , "v_cmpx_tru_f32">; + + +defm V_CMP_F_F64 : VOPC_F64 , "v_cmp_f_f64">; +defm V_CMP_LT_F64 : VOPC_F64 , "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; +defm V_CMP_EQ_F64 : VOPC_F64 , "v_cmp_eq_f64", COND_OEQ>; +defm V_CMP_LE_F64 : VOPC_F64 , "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; +defm V_CMP_GT_F64 : VOPC_F64 , "v_cmp_gt_f64", COND_OGT>; +defm V_CMP_LG_F64 : VOPC_F64 , 
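// Note: the trailing mnemonic on some VOPC defs above (e.g. "v_cmp_gt_f32" on
// v_cmp_lt_f32, "v_cmp_nle_f32" on v_cmp_nge_f32) names the comparison
// obtained by swapping the two operands; the getCommuteOrig/getCommuteRev
// mappings defined earlier tie each such pair together so compares can be
// commuted.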
"v_cmp_lg_f64", COND_ONE>; +defm V_CMP_GE_F64 : VOPC_F64 , "v_cmp_ge_f64", COND_OGE>; +defm V_CMP_O_F64 : VOPC_F64 , "v_cmp_o_f64", COND_O>; +defm V_CMP_U_F64 : VOPC_F64 , "v_cmp_u_f64", COND_UO>; +defm V_CMP_NGE_F64 : VOPC_F64 , "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; +defm V_CMP_NLG_F64 : VOPC_F64 , "v_cmp_nlg_f64", COND_UEQ>; +defm V_CMP_NGT_F64 : VOPC_F64 , "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; +defm V_CMP_NLE_F64 : VOPC_F64 , "v_cmp_nle_f64", COND_UGT>; +defm V_CMP_NEQ_F64 : VOPC_F64 , "v_cmp_neq_f64", COND_UNE>; +defm V_CMP_NLT_F64 : VOPC_F64 , "v_cmp_nlt_f64", COND_UGE>; +defm V_CMP_TRU_F64 : VOPC_F64 , "v_cmp_tru_f64">; + + +defm V_CMPX_F_F64 : VOPCX_F64 , "v_cmpx_f_f64">; +defm V_CMPX_LT_F64 : VOPCX_F64 , "v_cmpx_lt_f64", "v_cmpx_gt_f64">; +defm V_CMPX_EQ_F64 : VOPCX_F64 , "v_cmpx_eq_f64">; +defm V_CMPX_LE_F64 : VOPCX_F64 , "v_cmpx_le_f64", "v_cmpx_ge_f64">; +defm V_CMPX_GT_F64 : VOPCX_F64 , "v_cmpx_gt_f64">; +defm V_CMPX_LG_F64 : VOPCX_F64 , "v_cmpx_lg_f64">; +defm V_CMPX_GE_F64 : VOPCX_F64 , "v_cmpx_ge_f64">; +defm V_CMPX_O_F64 : VOPCX_F64 , "v_cmpx_o_f64">; +defm V_CMPX_U_F64 : VOPCX_F64 , "v_cmpx_u_f64">; +defm V_CMPX_NGE_F64 : VOPCX_F64 , "v_cmpx_nge_f64", "v_cmpx_nle_f64">; +defm V_CMPX_NLG_F64 : VOPCX_F64 , "v_cmpx_nlg_f64">; +defm V_CMPX_NGT_F64 : VOPCX_F64 , "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; +defm V_CMPX_NLE_F64 : VOPCX_F64 , "v_cmpx_nle_f64">; +defm V_CMPX_NEQ_F64 : VOPCX_F64 , "v_cmpx_neq_f64">; +defm V_CMPX_NLT_F64 : VOPCX_F64 , "v_cmpx_nlt_f64">; +defm V_CMPX_TRU_F64 : VOPCX_F64 , "v_cmpx_tru_f64">; + + +let SubtargetPredicate = isSICI in { + +defm V_CMPS_F_F32 : VOPC_F32 , "v_cmps_f_f32">; +defm V_CMPS_LT_F32 : VOPC_F32 , "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; +defm V_CMPS_EQ_F32 : VOPC_F32 , "v_cmps_eq_f32">; +defm V_CMPS_LE_F32 : VOPC_F32 , "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; +defm V_CMPS_GT_F32 : VOPC_F32 , "v_cmps_gt_f32">; +defm V_CMPS_LG_F32 : VOPC_F32 , "v_cmps_lg_f32">; +defm V_CMPS_GE_F32 : VOPC_F32 , "v_cmps_ge_f32">; +defm V_CMPS_O_F32 : VOPC_F32 , "v_cmps_o_f32">; +defm V_CMPS_U_F32 : VOPC_F32 , "v_cmps_u_f32">; +defm V_CMPS_NGE_F32 : VOPC_F32 , "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; +defm V_CMPS_NLG_F32 : VOPC_F32 , "v_cmps_nlg_f32">; +defm V_CMPS_NGT_F32 : VOPC_F32 , "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; +defm V_CMPS_NLE_F32 : VOPC_F32 , "v_cmps_nle_f32">; +defm V_CMPS_NEQ_F32 : VOPC_F32 , "v_cmps_neq_f32">; +defm V_CMPS_NLT_F32 : VOPC_F32 , "v_cmps_nlt_f32">; +defm V_CMPS_TRU_F32 : VOPC_F32 , "v_cmps_tru_f32">; + + +defm V_CMPSX_F_F32 : VOPCX_F32 , "v_cmpsx_f_f32">; +defm V_CMPSX_LT_F32 : VOPCX_F32 , "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; +defm V_CMPSX_EQ_F32 : VOPCX_F32 , "v_cmpsx_eq_f32">; +defm V_CMPSX_LE_F32 : VOPCX_F32 , "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; +defm V_CMPSX_GT_F32 : VOPCX_F32 , "v_cmpsx_gt_f32">; +defm V_CMPSX_LG_F32 : VOPCX_F32 , "v_cmpsx_lg_f32">; +defm V_CMPSX_GE_F32 : VOPCX_F32 , "v_cmpsx_ge_f32">; +defm V_CMPSX_O_F32 : VOPCX_F32 , "v_cmpsx_o_f32">; +defm V_CMPSX_U_F32 : VOPCX_F32 , "v_cmpsx_u_f32">; +defm V_CMPSX_NGE_F32 : VOPCX_F32 , "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; +defm V_CMPSX_NLG_F32 : VOPCX_F32 , "v_cmpsx_nlg_f32">; +defm V_CMPSX_NGT_F32 : VOPCX_F32 , "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; +defm V_CMPSX_NLE_F32 : VOPCX_F32 , "v_cmpsx_nle_f32">; +defm V_CMPSX_NEQ_F32 : VOPCX_F32 , "v_cmpsx_neq_f32">; +defm V_CMPSX_NLT_F32 : VOPCX_F32 , "v_cmpsx_nlt_f32">; +defm V_CMPSX_TRU_F32 : VOPCX_F32 , "v_cmpsx_tru_f32">; + + +defm V_CMPS_F_F64 : VOPC_F64 , "v_cmps_f_f64">; +defm 
V_CMPS_LT_F64 : VOPC_F64 , "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; +defm V_CMPS_EQ_F64 : VOPC_F64 , "v_cmps_eq_f64">; +defm V_CMPS_LE_F64 : VOPC_F64 , "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; +defm V_CMPS_GT_F64 : VOPC_F64 , "v_cmps_gt_f64">; +defm V_CMPS_LG_F64 : VOPC_F64 , "v_cmps_lg_f64">; +defm V_CMPS_GE_F64 : VOPC_F64 , "v_cmps_ge_f64">; +defm V_CMPS_O_F64 : VOPC_F64 , "v_cmps_o_f64">; +defm V_CMPS_U_F64 : VOPC_F64 , "v_cmps_u_f64">; +defm V_CMPS_NGE_F64 : VOPC_F64 , "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; +defm V_CMPS_NLG_F64 : VOPC_F64 , "v_cmps_nlg_f64">; +defm V_CMPS_NGT_F64 : VOPC_F64 , "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; +defm V_CMPS_NLE_F64 : VOPC_F64 , "v_cmps_nle_f64">; +defm V_CMPS_NEQ_F64 : VOPC_F64 , "v_cmps_neq_f64">; +defm V_CMPS_NLT_F64 : VOPC_F64 , "v_cmps_nlt_f64">; +defm V_CMPS_TRU_F64 : VOPC_F64 , "v_cmps_tru_f64">; + + +defm V_CMPSX_F_F64 : VOPCX_F64 , "v_cmpsx_f_f64">; +defm V_CMPSX_LT_F64 : VOPCX_F64 , "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; +defm V_CMPSX_EQ_F64 : VOPCX_F64 , "v_cmpsx_eq_f64">; +defm V_CMPSX_LE_F64 : VOPCX_F64 , "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; +defm V_CMPSX_GT_F64 : VOPCX_F64 , "v_cmpsx_gt_f64">; +defm V_CMPSX_LG_F64 : VOPCX_F64 , "v_cmpsx_lg_f64">; +defm V_CMPSX_GE_F64 : VOPCX_F64 , "v_cmpsx_ge_f64">; +defm V_CMPSX_O_F64 : VOPCX_F64 , "v_cmpsx_o_f64">; +defm V_CMPSX_U_F64 : VOPCX_F64 , "v_cmpsx_u_f64">; +defm V_CMPSX_NGE_F64 : VOPCX_F64 , "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; +defm V_CMPSX_NLG_F64 : VOPCX_F64 , "v_cmpsx_nlg_f64">; +defm V_CMPSX_NGT_F64 : VOPCX_F64 , "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; +defm V_CMPSX_NLE_F64 : VOPCX_F64 , "v_cmpsx_nle_f64">; +defm V_CMPSX_NEQ_F64 : VOPCX_F64 , "v_cmpsx_neq_f64">; +defm V_CMPSX_NLT_F64 : VOPCX_F64 , "v_cmpsx_nlt_f64">; +defm V_CMPSX_TRU_F64 : VOPCX_F64 , "v_cmpsx_tru_f64">; + +} // End SubtargetPredicate = isSICI + +defm V_CMP_F_I32 : VOPC_I32 , "v_cmp_f_i32">; +defm V_CMP_LT_I32 : VOPC_I32 , "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; +defm V_CMP_EQ_I32 : VOPC_I32 , "v_cmp_eq_i32", COND_EQ>; +defm V_CMP_LE_I32 : VOPC_I32 , "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; +defm V_CMP_GT_I32 : VOPC_I32 , "v_cmp_gt_i32", COND_SGT>; +defm V_CMP_NE_I32 : VOPC_I32 , "v_cmp_ne_i32", COND_NE>; +defm V_CMP_GE_I32 : VOPC_I32 , "v_cmp_ge_i32", COND_SGE>; +defm V_CMP_T_I32 : VOPC_I32 , "v_cmp_t_i32">; + + +defm V_CMPX_F_I32 : VOPCX_I32 , "v_cmpx_f_i32">; +defm V_CMPX_LT_I32 : VOPCX_I32 , "v_cmpx_lt_i32", "v_cmpx_gt_i32">; +defm V_CMPX_EQ_I32 : VOPCX_I32 , "v_cmpx_eq_i32">; +defm V_CMPX_LE_I32 : VOPCX_I32 , "v_cmpx_le_i32", "v_cmpx_ge_i32">; +defm V_CMPX_GT_I32 : VOPCX_I32 , "v_cmpx_gt_i32">; +defm V_CMPX_NE_I32 : VOPCX_I32 , "v_cmpx_ne_i32">; +defm V_CMPX_GE_I32 : VOPCX_I32 , "v_cmpx_ge_i32">; +defm V_CMPX_T_I32 : VOPCX_I32 , "v_cmpx_t_i32">; + + +defm V_CMP_F_I64 : VOPC_I64 , "v_cmp_f_i64">; +defm V_CMP_LT_I64 : VOPC_I64 , "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; +defm V_CMP_EQ_I64 : VOPC_I64 , "v_cmp_eq_i64", COND_EQ>; +defm V_CMP_LE_I64 : VOPC_I64 , "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; +defm V_CMP_GT_I64 : VOPC_I64 , "v_cmp_gt_i64", COND_SGT>; +defm V_CMP_NE_I64 : VOPC_I64 , "v_cmp_ne_i64", COND_NE>; +defm V_CMP_GE_I64 : VOPC_I64 , "v_cmp_ge_i64", COND_SGE>; +defm V_CMP_T_I64 : VOPC_I64 , "v_cmp_t_i64">; + + +defm V_CMPX_F_I64 : VOPCX_I64 , "v_cmpx_f_i64">; +defm V_CMPX_LT_I64 : VOPCX_I64 , "v_cmpx_lt_i64", "v_cmpx_gt_i64">; +defm V_CMPX_EQ_I64 : VOPCX_I64 , "v_cmpx_eq_i64">; +defm V_CMPX_LE_I64 : VOPCX_I64 , "v_cmpx_le_i64", "v_cmpx_ge_i64">; +defm V_CMPX_GT_I64 : 
VOPCX_I64 , "v_cmpx_gt_i64">; +defm V_CMPX_NE_I64 : VOPCX_I64 , "v_cmpx_ne_i64">; +defm V_CMPX_GE_I64 : VOPCX_I64 , "v_cmpx_ge_i64">; +defm V_CMPX_T_I64 : VOPCX_I64 , "v_cmpx_t_i64">; + + +defm V_CMP_F_U32 : VOPC_I32 , "v_cmp_f_u32">; +defm V_CMP_LT_U32 : VOPC_I32 , "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; +defm V_CMP_EQ_U32 : VOPC_I32 , "v_cmp_eq_u32", COND_EQ>; +defm V_CMP_LE_U32 : VOPC_I32 , "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; +defm V_CMP_GT_U32 : VOPC_I32 , "v_cmp_gt_u32", COND_UGT>; +defm V_CMP_NE_U32 : VOPC_I32 , "v_cmp_ne_u32", COND_NE>; +defm V_CMP_GE_U32 : VOPC_I32 , "v_cmp_ge_u32", COND_UGE>; +defm V_CMP_T_U32 : VOPC_I32 , "v_cmp_t_u32">; + + +defm V_CMPX_F_U32 : VOPCX_I32 , "v_cmpx_f_u32">; +defm V_CMPX_LT_U32 : VOPCX_I32 , "v_cmpx_lt_u32", "v_cmpx_gt_u32">; +defm V_CMPX_EQ_U32 : VOPCX_I32 , "v_cmpx_eq_u32">; +defm V_CMPX_LE_U32 : VOPCX_I32 , "v_cmpx_le_u32", "v_cmpx_le_u32">; +defm V_CMPX_GT_U32 : VOPCX_I32 , "v_cmpx_gt_u32">; +defm V_CMPX_NE_U32 : VOPCX_I32 , "v_cmpx_ne_u32">; +defm V_CMPX_GE_U32 : VOPCX_I32 , "v_cmpx_ge_u32">; +defm V_CMPX_T_U32 : VOPCX_I32 , "v_cmpx_t_u32">; + + +defm V_CMP_F_U64 : VOPC_I64 , "v_cmp_f_u64">; +defm V_CMP_LT_U64 : VOPC_I64 , "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; +defm V_CMP_EQ_U64 : VOPC_I64 , "v_cmp_eq_u64", COND_EQ>; +defm V_CMP_LE_U64 : VOPC_I64 , "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; +defm V_CMP_GT_U64 : VOPC_I64 , "v_cmp_gt_u64", COND_UGT>; +defm V_CMP_NE_U64 : VOPC_I64 , "v_cmp_ne_u64", COND_NE>; +defm V_CMP_GE_U64 : VOPC_I64 , "v_cmp_ge_u64", COND_UGE>; +defm V_CMP_T_U64 : VOPC_I64 , "v_cmp_t_u64">; + +defm V_CMPX_F_U64 : VOPCX_I64 , "v_cmpx_f_u64">; +defm V_CMPX_LT_U64 : VOPCX_I64 , "v_cmpx_lt_u64", "v_cmpx_gt_u64">; +defm V_CMPX_EQ_U64 : VOPCX_I64 , "v_cmpx_eq_u64">; +defm V_CMPX_LE_U64 : VOPCX_I64 , "v_cmpx_le_u64", "v_cmpx_ge_u64">; +defm V_CMPX_GT_U64 : VOPCX_I64 , "v_cmpx_gt_u64">; +defm V_CMPX_NE_U64 : VOPCX_I64 , "v_cmpx_ne_u64">; +defm V_CMPX_GE_U64 : VOPCX_I64 , "v_cmpx_ge_u64">; +defm V_CMPX_T_U64 : VOPCX_I64 , "v_cmpx_t_u64">; + +} // End isCompare = 1, isCommutable = 1 + +defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 , "v_cmp_class_f32">; +defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 , "v_cmpx_class_f32">; +defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 , "v_cmp_class_f64">; +defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 , "v_cmpx_class_f64">; + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// + +defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; +defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; +defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; +defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; +defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; +defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; +defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; +defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; +defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; +defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; +defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; +defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; +defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; +let mayLoad = 0 in { +defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : 
DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +} +defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; +defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; +defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; +defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; + +defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; +defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; +defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; +defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; +defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; +let mayLoad = 0 in { +defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; +defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; +} +defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < + 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < + 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 +>; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; +let SubtargetPredicate = isCI in { +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; +} // End isCI +defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; +let mayStore = 0 in { +defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; +defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; +defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; +defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; +defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; +} +defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; +defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; +defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; +defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; +defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; +defm DS_INC_U64 : 
DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; +defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; +defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; +defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; +defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; +defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; +defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; +defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; +defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; +let mayLoad = 0 in { +defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +} +defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; +defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; +defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; +defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; +defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; +defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; +defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; +defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; +defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; +defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; +defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; + +let mayStore = 0 in { +defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; +defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; +} + +defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; +defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; +defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; +defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; +defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; +defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; +defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; +defm 
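// Note: the DS opcode layout is visible in the numbers above: the 64-bit
// operations mirror the 32-bit ones at an offset of 0x40 (ds_add_u32 = 0x0,
// ds_add_u64 = 0x40), and each returning "_rtn_" form sits 0x20 above its
// no-return counterpart (ds_add_rtn_u32 = 0x20, ds_add_rtn_u64 = 0x60).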
DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; +defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; +defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; +defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; +defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; + +defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; +defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; + +defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; +defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; +defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; +defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; +defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; +defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; +defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; +defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; +defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; +defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; +defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; +defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; + +defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; +defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; + +//let SubtargetPredicate = isCI in { +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 +//} // End isCI + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < + mubuf<0x00>, "buffer_load_format_x", VGPR_32 +>; +defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < + mubuf<0x01>, "buffer_load_format_xy", VReg_64 +>; +defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < + mubuf<0x02>, "buffer_load_format_xyz", VReg_96 +>; +defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < + mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 +>; +defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < + mubuf<0x04>, "buffer_store_format_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < + mubuf<0x05>, "buffer_store_format_xy", VReg_64 +>; +defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < + mubuf<0x06>, "buffer_store_format_xyz", VReg_96 +>; +defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < + mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 +>; +defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global +>; +defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global +>; +defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global +>; +defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global +>; +defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load +>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load +>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load +>; + +defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < + mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global +>; + +defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < + mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, 
truncstorei16_global +>; + +defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < + mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store +>; + +defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store +>; + +defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < + mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store +>; + +defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < + mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global +>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < + mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < + mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global +>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < + mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < + mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < + mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < + mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND : MUBUF_Atomic < + mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global +>; +defm BUFFER_ATOMIC_OR : MUBUF_Atomic < + mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < + mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global +>; +//def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ , "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 , "buffer_atomic_smin_x2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 , "buffer_atomic_umin_x2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 , "buffer_atomic_smax_x2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 , "buffer_atomic_umax_x2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 , "buffer_atomic_and_x2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 , "buffer_atomic_or_x2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 , "buffer_atomic_xor_x2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 , "buffer_atomic_inc_x2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 , "buffer_atomic_dec_x2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , "buffer_atomic_fmin_x2", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 , "buffer_wbinvl1_sc", []>; // isn't on CI & VI +//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 , 
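// Note: the MUBUF load patterns above also encode the extension kind:
// buffer_load_ubyte/ushort match the zero/any-extending i8/i16 global loads
// (az_extloadi8/16_global) while buffer_load_sbyte/sshort match the
// sign-extending ones (sextloadi8/16_global); the byte/short stores
// correspondingly use the truncating truncstorei8/16_global patterns.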
"buffer_wbinvl1_vol", []>; // isn't on SI +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 , "buffer_wbinvl1", []>; + +//===----------------------------------------------------------------------===// +// MTBUF Instructions +//===----------------------------------------------------------------------===// + +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; + +//===----------------------------------------------------------------------===// +// MIMG Instructions +//===----------------------------------------------------------------------===// + +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; +defm 
IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; +defm 
IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; +defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +let vdst = 0, src0 = 0 in { +defm V_NOP : VOP1_m , (outs), (ins), "v_nop", [], "v_nop">; +} + +let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; +} // End isMoveImm = 1 + +let Uses = [EXEC] in { + +// FIXME: Specify SchedRW for READFIRSTLANE_B32 + +def V_READFIRSTLANE_B32 : VOP1 < + 0x00000002, + (outs SReg_32:$vdst), + (ins VGPR_32:$src0), + "v_readfirstlane_b32 $vdst, $src0", + [] +>; + +} + +let SchedRW = [WriteQuarterRate32] in { + +defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", + VOP_I32_F64, fp_to_sint +>; +defm V_CVT_F64_I32 : VOP1Inst , "v_cvt_f64_i32", + VOP_F64_I32, sint_to_fp +>; +defm V_CVT_F32_I32 : VOP1Inst , "v_cvt_f32_i32", + VOP_F32_I32, sint_to_fp +>; +defm V_CVT_F32_U32 : VOP1Inst , "v_cvt_f32_u32", + VOP_F32_I32, uint_to_fp +>; +defm V_CVT_U32_F32 : VOP1Inst , "v_cvt_u32_f32", + VOP_I32_F32, fp_to_uint +>; +defm V_CVT_I32_F32 : VOP1Inst , "v_cvt_i32_f32", + VOP_I32_F32, fp_to_sint +>; +defm V_CVT_F16_F32 : VOP1Inst , "v_cvt_f16_f32", + VOP_I32_F32, fp_to_f16 +>; +defm V_CVT_F32_F16 : VOP1Inst , "v_cvt_f32_f16", + VOP_F32_I32, f16_to_fp +>; +defm V_CVT_RPI_I32_F32 : VOP1Inst , "v_cvt_rpi_i32_f32", + VOP_I32_F32, cvt_rpi_i32_f32>; +defm V_CVT_FLR_I32_F32 : VOP1Inst , "v_cvt_flr_i32_f32", + VOP_I32_F32, cvt_flr_i32_f32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; +defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", + VOP_F32_F64, fround +>; +defm V_CVT_F64_F32 : VOP1Inst , "v_cvt_f64_f32", + VOP_F64_F32, fextend +>; +defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0", + VOP_F32_I32, AMDGPUcvt_f32_ubyte0 +>; +defm V_CVT_F32_UBYTE1 : VOP1Inst , "v_cvt_f32_ubyte1", + VOP_F32_I32, AMDGPUcvt_f32_ubyte1 +>; +defm V_CVT_F32_UBYTE2 : VOP1Inst , "v_cvt_f32_ubyte2", + VOP_F32_I32, AMDGPUcvt_f32_ubyte2 +>; +defm V_CVT_F32_UBYTE3 : VOP1Inst , "v_cvt_f32_ubyte3", + VOP_F32_I32, AMDGPUcvt_f32_ubyte3 +>; +defm V_CVT_U32_F64 : VOP1Inst , 
"v_cvt_u32_f64", + VOP_I32_F64, fp_to_uint +>; +defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32", + VOP_F64_I32, uint_to_fp +>; + +} // let SchedRW = [WriteQuarterRate32] + +defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", + VOP_F32_F32, AMDGPUfract +>; +defm V_TRUNC_F32 : VOP1Inst , "v_trunc_f32", + VOP_F32_F32, ftrunc +>; +defm V_CEIL_F32 : VOP1Inst , "v_ceil_f32", + VOP_F32_F32, fceil +>; +defm V_RNDNE_F32 : VOP1Inst , "v_rndne_f32", + VOP_F32_F32, frint +>; +defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32", + VOP_F32_F32, ffloor +>; +defm V_EXP_F32 : VOP1Inst , "v_exp_f32", + VOP_F32_F32, fexp2 +>; + +let SchedRW = [WriteQuarterRate32] in { + +defm V_LOG_F32 : VOP1Inst , "v_log_f32", + VOP_F32_F32, flog2 +>; +defm V_RCP_F32 : VOP1Inst , "v_rcp_f32", + VOP_F32_F32, AMDGPUrcp +>; +defm V_RCP_IFLAG_F32 : VOP1Inst , "v_rcp_iflag_f32", + VOP_F32_F32 +>; +defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", + VOP_F32_F32, AMDGPUrsq +>; + +} //let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_F64 : VOP1Inst , "v_rcp_f64", + VOP_F64_F64, AMDGPUrcp +>; +defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", + VOP_F64_F64, AMDGPUrsq +>; + +} // let SchedRW = [WriteDouble]; + +defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", + VOP_F32_F32, fsqrt +>; + +let SchedRW = [WriteDouble] in { + +defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64", + VOP_F64_F64, fsqrt +>; + +} // let SchedRW = [WriteDouble] + +defm V_SIN_F32 : VOP1Inst , "v_sin_f32", + VOP_F32_F32, AMDGPUsin +>; +defm V_COS_F32 : VOP1Inst , "v_cos_f32", + VOP_F32_F32, AMDGPUcos +>; +defm V_NOT_B32 : VOP1Inst , "v_not_b32", VOP_I32_I32>; +defm V_BFREV_B32 : VOP1Inst , "v_bfrev_b32", VOP_I32_I32>; +defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>; +defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>; +defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>; +defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64", + VOP_I32_F64 +>; +defm V_FREXP_MANT_F64 : VOP1Inst , "v_frexp_mant_f64", + VOP_F64_F64 +>; +defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", VOP_F64_F64>; +defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32", + VOP_I32_F32 +>; +defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32", + VOP_F32_F32 +>; +let vdst = 0, src0 = 0 in { +defm V_CLREXCP : VOP1_m , (outs), (ins), "v_clrexcp", [], + "v_clrexcp" +>; +} +defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_I32_I32>; +defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_I32>; +defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>; + +// These instruction only exist on SI and CI +let SubtargetPredicate = isSICI in { + +let SchedRW = [WriteQuarterRate32] in { + +defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>; +defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32>; +defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>; +defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>; +defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32", + VOP_F32_F32, AMDGPUrsq_clamped +>; +defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32", + VOP_F32_F32, AMDGPUrsq_legacy +>; + +} // End let SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteDouble] in { + +defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>; +defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", + VOP_F64_F64, AMDGPUrsq_clamped +>; + +} // End SchedRW = [WriteDouble] + +} // End SubtargetPredicate = isSICI + +//===----------------------------------------------------------------------===// +// VINTRP Instructions 
+//===----------------------------------------------------------------------===// + +let Uses = [M0] in { + +// FIXME: Specify SchedRW for VINTRP insturctions. + +multiclass V_INTERP_P1_F32_m : VINTRP_m < + 0x00000000, + (outs VGPR_32:$dst), + (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), + (i32 imm:$attr)))] +>; + +let OtherPredicates = [has32BankLDS] in { + +defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS] + +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { + +defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; + +} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" + +let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { + +defm V_INTERP_P2_F32 : VINTRP_m < + 0x00000001, + (outs VGPR_32:$dst), + (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), + "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" + +defm V_INTERP_MOV_F32 : VINTRP_m < + 0x00000002, + (outs VGPR_32:$dst), + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), + "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", + [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), + (i32 imm:$attr)))]>; + +} // End Uses = [M0] + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +multiclass V_CNDMASK { + defm _e32 : VOP2_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], + name, name>; + + defm _e64 : VOP3_m < + op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, + name#!cast(VOP_CNDMASK.Asm64), [], name, 3>; +} + +defm V_CNDMASK_B32 : V_CNDMASK, "v_cndmask_b32">; + +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2Inst , "v_add_f32", + VOP_F32_F32_F32, fadd +>; + +defm V_SUB_F32 : VOP2Inst , "v_sub_f32", VOP_F32_F32_F32, fsub>; +defm V_SUBREV_F32 : VOP2Inst , "v_subrev_f32", + VOP_F32_F32_F32, null_frag, "v_sub_f32" +>; +} // End isCommutable = 1 + +let isCommutable = 1 in { + +defm V_MUL_LEGACY_F32 : VOP2Inst , "v_mul_legacy_f32", + VOP_F32_F32_F32, int_AMDGPU_mul +>; + +defm V_MUL_F32 : VOP2Inst , "v_mul_f32", + VOP_F32_F32_F32, fmul +>; + +defm V_MUL_I32_I24 : VOP2Inst , "v_mul_i32_i24", + VOP_I32_I32_I32, AMDGPUmul_i24 +>; + +defm V_MUL_HI_I32_I24 : VOP2Inst , "v_mul_hi_i32_i24", + VOP_I32_I32_I32 +>; + +defm V_MUL_U32_U24 : VOP2Inst , "v_mul_u32_u24", + VOP_I32_I32_I32, AMDGPUmul_u24 +>; + +defm V_MUL_HI_U32_U24 : VOP2Inst , "v_mul_hi_u32_u24", + VOP_I32_I32_I32 +>; + +defm V_MIN_F32 : VOP2Inst , "v_min_f32", VOP_F32_F32_F32, + fminnum>; +defm V_MAX_F32 : VOP2Inst , "v_max_f32", VOP_F32_F32_F32, + fmaxnum>; +defm V_MIN_I32 : VOP2Inst , "v_min_i32", VOP_I32_I32_I32>; +defm V_MAX_I32 : VOP2Inst , "v_max_i32", VOP_I32_I32_I32>; +defm V_MIN_U32 : VOP2Inst , "v_min_u32", VOP_I32_I32_I32>; +defm V_MAX_U32 : VOP2Inst , "v_max_u32", VOP_I32_I32_I32>; + +defm V_LSHRREV_B32 : VOP2Inst < + vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, + "v_lshr_b32" +>; + +defm V_ASHRREV_I32 : VOP2Inst < + vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, + "v_ashr_i32" +>; + +defm V_LSHLREV_B32 : VOP2Inst < + vop2<0x1a, 0x12>, "v_lshlrev_b32", 
VOP_I32_I32_I32, null_frag, + "v_lshl_b32" +>; + +defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; +defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; +defm V_XOR_B32 : VOP2Inst , "v_xor_b32", VOP_I32_I32_I32>; + +defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_F32_F32_F32>; +} // End isCommutable = 1 + +defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32">; + +let isCommutable = 1 in { +defm V_MADAK_F32 : VOP2MADK , "v_madak_f32">; +} // End isCommutable = 1 + +let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +// No patterns so that the scalar instructions are always selected. +// The scalar versions will be replaced with vector when needed later. + +// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, +// but the VI instructions behave the same as the SI versions. +defm V_ADD_I32 : VOP2bInst , "v_add_i32", + VOP_I32_I32_I32, add +>; +defm V_SUB_I32 : VOP2bInst , "v_sub_i32", VOP_I32_I32_I32>; + +defm V_SUBREV_I32 : VOP2bInst , "v_subrev_i32", + VOP_I32_I32_I32, null_frag, "v_sub_i32" +>; + +let Uses = [VCC] in { // Carry-in comes from VCC +defm V_ADDC_U32 : VOP2bInst , "v_addc_u32", + VOP_I32_I32_I32_VCC +>; +defm V_SUBB_U32 : VOP2bInst , "v_subb_u32", + VOP_I32_I32_I32_VCC +>; +defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", + VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" +>; + +} // End Uses = [VCC] +} // End isCommutable = 1, Defs = [VCC] + +defm V_READLANE_B32 : VOP2SI_3VI_m < + vop3 <0x001, 0x289>, + "v_readlane_b32", + (outs SReg_32:$vdst), + (ins VGPR_32:$src0, SCSrc_32:$src1), + "v_readlane_b32 $vdst, $src0, $src1" +>; + +defm V_WRITELANE_B32 : VOP2SI_3VI_m < + vop3 <0x002, 0x28a>, + "v_writelane_b32", + (outs VGPR_32:$vdst), + (ins SReg_32:$src0, SCSrc_32:$src1), + "v_writelane_b32 $vdst, $src0, $src1" +>; + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_MIN_LEGACY_F32 : VOP2InstSI , "v_min_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmin_legacy +>; +defm V_MAX_LEGACY_F32 : VOP2InstSI , "v_max_legacy_f32", + VOP_F32_F32_F32, AMDGPUfmax_legacy +>; + +let isCommutable = 1 in { +defm V_LSHR_B32 : VOP2InstSI , "v_lshr_b32", VOP_I32_I32_I32>; +defm V_ASHR_I32 : VOP2InstSI , "v_ashr_i32", VOP_I32_I32_I32>; +defm V_LSHL_B32 : VOP2InstSI , "v_lshl_b32", VOP_I32_I32_I32>; +} // End isCommutable = 1 +} // End let SubtargetPredicate = SICI + +let isCommutable = 1 in { +defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst , "v_mac_legacy_f32", + VOP_F32_F32_F32 +>; +} // End isCommutable = 1 + +defm V_BFM_B32 : VOP2_VI3_Inst , "v_bfm_b32", + VOP_I32_I32_I32 +>; +defm V_BCNT_U32_B32 : VOP2_VI3_Inst , "v_bcnt_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_lo_u32_b32", + VOP_I32_I32_I32 +>; +defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_hi_u32_b32", + VOP_I32_I32_I32 +>; +defm V_LDEXP_F32 : VOP2_VI3_Inst , "v_ldexp_f32", + VOP_F32_F32_I32, AMDGPUldexp +>; + +defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst , "v_cvt_pkaccum_u8_f32", + VOP_I32_F32_I32>; // TODO: set "Uses = dst" + +defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_i16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_u16_f32", + VOP_I32_F32_F32 +>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst , "v_cvt_pkrtz_f16_f32", + VOP_I32_F32_F32, int_SI_packf16 +>; +defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst , "v_cvt_pk_u16_u32", + VOP_I32_I32_I32 +>; +defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst , "v_cvt_pk_i16_i32", + VOP_I32_I32_I32 +>; + 
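+// Illustrative sketch (assembly spelling assumed, not taken from this
+// source): the VOP2b carry instructions above chain wide integer adds
+// through VCC. A 64-bit add of {v3,v2} and {v5,v4} into {v1,v0} is roughly:
+//   v_add_i32  v0, vcc, v2, v4        // low half, carry-out written to VCC
+//   v_addc_u32 v1, vcc, v3, v5, vcc   // high half, carry-in read from VCC
+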
+//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { +defm V_MAD_LEGACY_F32 : VOP3Inst , "v_mad_legacy_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_MAD_F32 : VOP3Inst , "v_mad_f32", + VOP_F32_F32_F32_F32, fmad +>; + +defm V_MAD_I32_I24 : VOP3Inst , "v_mad_i32_i24", + VOP_I32_I32_I32_I32, AMDGPUmad_i24 +>; +defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", + VOP_I32_I32_I32_I32, AMDGPUmad_u24 +>; +} // End isCommutable = 1 + +defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", + VOP_F32_F32_F32_F32 +>; +defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", + VOP_F32_F32_F32_F32 +>; + +defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", + VOP_I32_I32_I32_I32, AMDGPUbfe_u32 +>; +defm V_BFE_I32 : VOP3Inst , "v_bfe_i32", + VOP_I32_I32_I32_I32, AMDGPUbfe_i32 +>; + +defm V_BFI_B32 : VOP3Inst , "v_bfi_b32", + VOP_I32_I32_I32_I32, AMDGPUbfi +>; + +let isCommutable = 1 in { +defm V_FMA_F32 : VOP3Inst , "v_fma_f32", + VOP_F32_F32_F32_F32, fma +>; +defm V_FMA_F64 : VOP3Inst , "v_fma_f64", + VOP_F64_F64_F64_F64, fma +>; +} // End isCommutable = 1 + +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; +defm V_ALIGNBIT_B32 : VOP3Inst , "v_alignbit_b32", + VOP_I32_I32_I32_I32 +>; +defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", + VOP_I32_I32_I32_I32 +>; + +defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; + +defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 +>; +defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 +>; +defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +defm V_MED3_F32 : VOP3Inst , "v_med3_f32", + VOP_F32_F32_F32_F32 +>; +defm V_MED3_I32 : VOP3Inst , "v_med3_i32", + VOP_I32_I32_I32_I32 +>; +defm V_MED3_U32 : VOP3Inst , "v_med3_u32", + VOP_I32_I32_I32_I32 +>; + +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; +defm V_SAD_U32 : VOP3Inst , "v_sad_u32", + VOP_I32_I32_I32_I32 +>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +defm V_DIV_FIXUP_F32 : VOP3Inst < + vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup +>; + +let SchedRW = [WriteDouble] in { + +defm V_DIV_FIXUP_F64 : VOP3Inst < + vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup +>; + +} // let SchedRW = [WriteDouble] + +let SchedRW = [WriteDouble] in { +let isCommutable = 1 in { + +defm V_ADD_F64 : VOP3Inst , "v_add_f64", + VOP_F64_F64_F64, fadd +>; +defm V_MUL_F64 : VOP3Inst , "v_mul_f64", + VOP_F64_F64_F64, fmul +>; + +defm V_MIN_F64 : VOP3Inst , "v_min_f64", + VOP_F64_F64_F64, fminnum +>; +defm V_MAX_F64 : VOP3Inst , "v_max_f64", + VOP_F64_F64_F64, fmaxnum +>; + +} // isCommutable = 1 + +defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", + VOP_F64_F64_I32, AMDGPUldexp +>; + +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { + +defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_U32 : VOP3Inst , 
"v_mul_hi_u32", + VOP_I32_I32_I32 +>; + +defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", + VOP_I32_I32_I32 +>; +defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", + VOP_I32_I32_I32 +>; + +} // isCommutable = 1, SchedRW = [WriteQuarterRate32] + +let SchedRW = [WriteFloatFMA, WriteSALU] in { +defm V_DIV_SCALE_F32 : VOP3b_32 , "v_div_scale_f32", []>; +} + +let SchedRW = [WriteDouble, WriteSALU] in { +// Double precision division pre-scale. +defm V_DIV_SCALE_F64 : VOP3b_64 , "v_div_scale_f64", []>; +} // let SchedRW = [WriteDouble] + +let isCommutable = 1, Uses = [VCC] in { + +// v_div_fmas_f32: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^32 +// +defm V_DIV_FMAS_F32 : VOP3_VCC_Inst , "v_div_fmas_f32", + VOP_F32_F32_F32_F32, AMDGPUdiv_fmas +>; + +let SchedRW = [WriteDouble] in { +// v_div_fmas_f64: +// result = src0 * src1 + src2 +// if (vcc) +// result *= 2^64 +// +defm V_DIV_FMAS_F64 : VOP3_VCC_Inst , "v_div_fmas_f64", + VOP_F64_F64_F64_F64, AMDGPUdiv_fmas +>; + +} // End SchedRW = [WriteDouble] +} // End isCommutable = 1 + +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; + +let SchedRW = [WriteDouble] in { +defm V_TRIG_PREOP_F64 : VOP3Inst < + vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop +>; + +} // let SchedRW = [WriteDouble] + +// These instructions only exist on SI and CI +let SubtargetPredicate = isSICI in { + +defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32>; +defm V_LSHR_B64 : VOP3Inst , "v_lshr_b64", VOP_I64_I64_I32>; +defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32>; + +defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", + VOP_F32_F32_F32_F32>; + +} // End SubtargetPredicate = isSICI + +let SubtargetPredicate = isVI in { + +defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", + VOP_I64_I32_I64 +>; +defm V_LSHRREV_B64 : VOP3Inst , "v_lshrrev_b64", + VOP_I64_I32_I64 +>; +defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", + VOP_I64_I32_I64 +>; + +} // End SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// Pseudo Instructions +//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isPseudo = 1 in { + +// For use in patterns +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] +>; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// 64-bit vector move instruction. This is mainly used by the SIFoldOperands +// pass to enable folding of inline immediates. +def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 + +let hasSideEffects = 1 in { +def SGPR_USE : InstSI <(outs),(ins), "", []>; +} + +// SI pseudo instructions. These are used by the CFG structurizer pass +// and should be lowered to ISA instructions prior to codegen. 
+ +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { +let Uses = [EXEC], Defs = [EXEC] in { + +let isBranch = 1, isTerminator = 1 in { + +def SI_IF: InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, brtarget:$target), + "", + [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target), + "", + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] +> { + let Constraints = "$src = $dst"; +} + +def SI_LOOP : InstSI < + (outs), + (ins SReg_64:$saved, brtarget:$target), + "si_loop $saved, $target", + [(int_SI_loop i64:$saved, bb:$target)] +>; + +} // end isBranch = 1, isTerminator = 1 + +def SI_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src), + "si_else $dst, $src", + [(set i64:$dst, (int_SI_break i64:$src))] +>; + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$vcc, SReg_64:$src), + "si_if_break $dst, $vcc, $src", + [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src0, SReg_64:$src1), + "si_else_break $dst, $src0, $src1", + [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] +>; + +def SI_END_CF : InstSI < + (outs), + (ins SReg_64:$saved), + "si_end_cf $saved", + [(int_SI_end_cf i64:$saved)] +>; + +} // End Uses = [EXEC], Defs = [EXEC] + +let Uses = [EXEC], Defs = [EXEC,VCC] in { +def SI_KILL : InstSI < + (outs), + (ins VSrc_32:$src), + "si_kill $src", + [(int_AMDGPU_kill f32:$src)] +>; +} // End Uses = [EXEC], Defs = [EXEC,VCC] + +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 + +let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { + +//defm SI_ : RegisterLoadStore ; + +let UseNamedOperandTable = 1 in { + +def SI_RegisterLoad : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins FRAMEri32:$addr, i32imm:$chan), + "", [] +> { + let isRegisterLoad = 1; + let mayLoad = 1; +} + +class SIRegStore : InstSI < + outs, + (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), + "", [] +> { + let isRegisterStore = 1; + let mayStore = 1; +} + +let usesCustomInserter = 1 in { +def SI_RegisterStorePseudo : SIRegStore<(outs)>; +} // End usesCustomInserter = 1 +def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; + + +} // End UseNamedOperandTable = 1 + +def SI_INDIRECT_SRC : InstSI < + (outs VGPR_32:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + "si_indirect_src $dst, $temp, $src, $idx, $off", + [] +>; + +class SI_INDIRECT_DST : InstSI < + (outs rc:$dst, SReg_64:$temp), + (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), + "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", + [] +> { + let Constraints = "$src = $dst"; +} + +def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; +def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; + +} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] + +multiclass SI_SPILL_SGPR { + + let UseNamedOperandTable = 1 in { + def _SAVE : InstSI < + (outs), + (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs sgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1 +} + +// It's unclear whether you can use M0 as the output of v_readlane_b32 +// instructions, so use SGPR_32 register class for spills to prevent +// this from happening. 
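+// Each spill pseudo is a _SAVE/_RESTORE pair: _SAVE consumes the register
+// together with a frame index and the scratch resource/offset registers,
+// and _RESTORE reproduces it. SGPR spills are expected to be rewritten to
+// v_writelane_b32 / v_readlane_b32 accesses of VGPR lanes (hence the note
+// above), while the VGPR spill pseudos further below go through scratch
+// (MUBUF) memory.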
+defm SI_SPILL_S32 : SI_SPILL_SGPR ; +defm SI_SPILL_S64 : SI_SPILL_SGPR ; +defm SI_SPILL_S128 : SI_SPILL_SGPR ; +defm SI_SPILL_S256 : SI_SPILL_SGPR ; +defm SI_SPILL_S512 : SI_SPILL_SGPR ; + +multiclass SI_SPILL_VGPR { + let UseNamedOperandTable = 1, VGPRSpill = 1 in { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, + SReg_32:$scratch_offset), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + "", [] + >; + } // End UseNamedOperandTable = 1, VGPRSpill = 1 +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR ; +defm SI_SPILL_V64 : SI_SPILL_VGPR ; +defm SI_SPILL_V96 : SI_SPILL_VGPR ; +defm SI_SPILL_V128 : SI_SPILL_VGPR ; +defm SI_SPILL_V256 : SI_SPILL_VGPR ; +defm SI_SPILL_V512 : SI_SPILL_VGPR ; + +let Defs = [SCC] in { + +def SI_CONSTDATA_PTR : InstSI < + (outs SReg_64:$dst), + (ins), + "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] +>; + +} // End Defs = [SCC] + +} // end IsCodeGenOnly, isPseudo + +} // end SubtargetPredicate = isGCN + +let Predicates = [isGCN] in { + +def : Pat< + (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), + (V_CNDMASK_B32_e64 $src2, $src1, + (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, + DSTCLAMP.NONE, DSTOMOD.NONE)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KILL 0xbf800000) +>; + +/* int_SI_vs_load_input */ +def : Pat< + (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) +>; + +/* int_SI_export */ +def : Pat < + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, + f32:$src0, f32:$src1, f32:$src2, f32:$src3), + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, + $src0, $src1, $src2, $src3) +>; + +//===----------------------------------------------------------------------===// +// SMRD Patterns +//===----------------------------------------------------------------------===// + +multiclass SMRD_Pattern { + + // 1. SI-CI: Offset as 8bit DWORD immediate + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), + (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +multiclass SMRD_Pattern_vi { + + // 1. VI: Offset as 20bit immediate in bytes + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), + (vt (Instr_IMM $sbase, (as_i32imm $offset))) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), + (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + >; + + // 3. No offset at all + def : Pat < + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) + >; +} + +let Predicates = [isSICI] in { +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +defm : SMRD_Pattern ; +} // End Predicates = [isSICI] + +let Predicates = [isVI] in { +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +defm : SMRD_Pattern_vi ; +} // End Predicates = [isVI] + +let Predicates = [isSICI] in { + +// 1. 
Offset as 8bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) +>; + +} // End Predicates = [isSICI] + +// 2. Offset loaded in an 32bit SGPR +def : Pat < + (SIload_constant v4i32:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) +>; + +//===----------------------------------------------------------------------===// +// SOP1 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (ctpop i64:$src)), + (i64 (REG_SEQUENCE SReg_64, + (S_BCNT1_I32_B64 $src), sub0, + (S_MOV_B32 0), sub1)) +>; + +//===----------------------------------------------------------------------===// +// SOP2 Patterns +//===----------------------------------------------------------------------===// + +// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector +// case, the sgpr-copies pass will fix this to use the vector version. +def : Pat < + (i32 (addc i32:$src0, i32:$src1)), + (S_ADD_U32 $src0, $src1) +>; + +//===----------------------------------------------------------------------===// +// SOPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_AMDGPU_barrier_global), + (S_BARRIER) +>; + +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [UnsafeFPMath] in { + +//def : RcpPat; +//defm : RsqPat; +//defm : RsqPat; + +def : RsqPat; +def : RsqPat; +} + +//===----------------------------------------------------------------------===// +// VOP2 Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; + +def : Pat < + (i32 (select i1:$src0, i32:$src1, i32:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +/********** ======================= **********/ +/********** Image sampling patterns **********/ +/********** ======================= **********/ + +// Image + sampler +class SampleRawPattern : Pat < + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc, $sampler) +>; + +multiclass SampleRawPatterns { + def : SampleRawPattern(opcode # _V4_V1), i32>; + def : SampleRawPattern(opcode # _V4_V2), v2i32>; + def : SampleRawPattern(opcode # _V4_V4), v4i32>; + def : SampleRawPattern(opcode # _V4_V8), v8i32>; + def : SampleRawPattern(opcode # _V4_V16), v16i32>; +} + +// Image only +class ImagePattern : Pat < + (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, + i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), + $addr, $rsrc) +>; + +multiclass ImagePatterns { + def : ImagePattern(opcode # _V4_V1), i32>; + def : ImagePattern(opcode # _V4_V2), v2i32>; + def : ImagePattern(opcode # _V4_V4), v4i32>; +} + +// Basic sample +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; 
+defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with comparison +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with offsets +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Sample with comparison and offsets +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; +defm : SampleRawPatterns; + +// Gather opcodes +// Only the variants which make sense are defined. +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : SampleRawPattern; +def : SampleRawPattern; +def : SampleRawPattern; + +def : ImagePattern; +defm : ImagePatterns; +defm : ImagePatterns; + +/* SIsample for simple 1D texture lookup */ +def : Pat < + (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SamplePattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleRectPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleArrayPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +class SampleShadowArrayPattern : Pat < + (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) +>; + +/* SIsample* for texture lookups consuming more address parameters */ +multiclass SamplePatterns { + def : SamplePattern ; + def : SampleRectPattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : 
SampleShadowPattern ; + def : SampleShadowArrayPattern ; + + def : SamplePattern ; + def : SampleArrayPattern ; + def : SampleShadowPattern ; + def : SampleShadowArrayPattern ; +} + +defm : SamplePatterns; +defm : SamplePatterns; +defm : SamplePatterns; +defm : SamplePatterns; + +/* int_SI_imageload for texture fetches consuming varying address parameters */ +class ImageLoadPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadMSAAPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayMSAAPattern : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +multiclass ImageLoadPatterns { + def : ImageLoadPattern ; + def : ImageLoadArrayPattern ; +} + +multiclass ImageLoadMSAAPatterns { + def : ImageLoadMSAAPattern ; + def : ImageLoadArrayMSAAPattern ; +} + +defm : ImageLoadPatterns; +defm : ImageLoadPatterns; + +defm : ImageLoadMSAAPatterns; +defm : ImageLoadMSAAPatterns; + +/* Image resource information */ +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), + (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +/********** ============================================ **********/ +/********** Extraction, Insertion, Building and Casting **********/ +/********** ============================================ **********/ + +foreach Index = 0-2 in { + def Extract_Element_v2i32_#Index : Extract_Element < + i32, v2i32, Index, !cast(sub#Index) + >; + def Insert_Element_v2i32_#Index : Insert_Element < + i32, v2i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v2f32_#Index : Extract_Element < + f32, v2f32, Index, !cast(sub#Index) + >; + def Insert_Element_v2f32_#Index : Insert_Element < + f32, v2f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-3 in { + def Extract_Element_v4i32_#Index : Extract_Element < + i32, v4i32, Index, !cast(sub#Index) + >; + def Insert_Element_v4i32_#Index : Insert_Element < + i32, v4i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v4f32_#Index : Extract_Element < + f32, v4f32, Index, !cast(sub#Index) + >; + def Insert_Element_v4f32_#Index : Insert_Element < + f32, v4f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-7 in { + def Extract_Element_v8i32_#Index : Extract_Element < + i32, v8i32, Index, !cast(sub#Index) + >; + def Insert_Element_v8i32_#Index : Insert_Element < + i32, v8i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v8f32_#Index : Extract_Element < + f32, v8f32, Index, !cast(sub#Index) + >; + def Insert_Element_v8f32_#Index : Insert_Element < + f32, v8f32, Index, !cast(sub#Index) + >; +} + +foreach Index = 0-15 in { + def Extract_Element_v16i32_#Index : Extract_Element < + i32, v16i32, Index, !cast(sub#Index) + >; + def Insert_Element_v16i32_#Index : Insert_Element < + i32, v16i32, Index, !cast(sub#Index) + >; + + def Extract_Element_v16f32_#Index : Extract_Element < + f32, v16f32, Index, 
!cast(sub#Index) + >; + def Insert_Element_v16f32_#Index : Insert_Element < + f32, v16f32, Index, !cast(sub#Index) + >; +} + +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; + +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; +def : BitConvert ; + +def : BitConvert ; +def : BitConvert ; + +/********** =================== **********/ +/********** Src & Dst modifiers **********/ +/********** =================== **********/ + +def : Pat < + (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), + (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) +>; + +/********** ================================ **********/ +/********** Floating point absolute/negative **********/ +/********** ================================ **********/ + +// Prevent expanding both fneg and fabs. + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f32:$src)), + (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ +>; + +// FIXME: Should use S_OR_B32 +def : Pat < + (fneg (fabs f64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), // Set sign bit. + sub1) +>; + +def : Pat < + (fabs f32:$src), + (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) +>; + +def : Pat < + (fneg f32:$src), + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) +>; + +def : Pat < + (fabs f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + sub1) +>; + +def : Pat < + (fneg f64:$src), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG f64:$src, sub0)), + sub0, + (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), + (V_MOV_B32_e32 0x80000000)), + sub1) +>; + +/********** ================== **********/ +/********** Immediate Patterns **********/ +/********** ================== **********/ + +def : Pat < + (SGPRImm<(i32 imm)>:$imm), + (S_MOV_B32 imm:$imm) +>; + +def : Pat < + (SGPRImm<(f32 fpimm)>:$imm), + (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i32 imm:$imm), + (V_MOV_B32_e32 imm:$imm) +>; + +def : Pat < + (f32 fpimm:$imm), + (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : Pat < + (i64 InlineImm:$imm), + (S_MOV_B64 InlineImm:$imm) +>; + +// XXX - Should this use a s_cmp to set SCC? 
+ +// Set to sign-extended 64-bit value (true = -1, false = 0) +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 (i64 (as_i64imm $imm))) +>; + +def : Pat < + (f64 InlineFPImm:$imm), + (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm:$imm))) +>; + +/********** ================== **********/ +/********** Intrinsic Patterns **********/ +/********** ================== **********/ + +/* llvm.AMDGPU.pow */ +def : POW_Common ; + +def : Pat < + (int_AMDGPU_div f32:$src0, f32:$src1), + (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) +>; + +def : Pat < + (int_AMDGPU_cube v4f32:$src), + (REG_SEQUENCE VReg_128, + (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub0, + (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub1, + (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub2, + (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), + 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + 0 /* clamp */, 0 /* omod */), sub3) +>; + +def : Pat < + (i32 (sext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) +>; + +class Ext32Pat : Pat < + (i32 (ext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) +>; + +def : Ext32Pat ; +def : Ext32Pat ; + +// Offset in an 32Bit VGPR +def : Pat < + (SIload_constant v4i32:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) +>; + +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (V_CVT_U32_F32_e32 + (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) +>; + +def : Pat < + (int_SI_tid), + (V_MBCNT_HI_U32_B32_e64 0xffffffff, + (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) +>; + +//===----------------------------------------------------------------------===// +// VOP3 Patterns +//===----------------------------------------------------------------------===// + +def : IMad24Pat; +def : UMad24Pat; + +def : Pat < + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1) +>; + +def : Pat < + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1) +>; + +defm : BFIPatterns ; +def : ROTRPattern ; + +/********** ======================= **********/ +/********** Load/Store Patterns **********/ +/********** ======================= **********/ + +class DSReadPat : Pat < + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst $ptr, (as_i16imm $offset), (i1 0)) +>; + +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; +def : DSReadPat ; + +let AddedComplexity = 100 in { + +def : DSReadPat ; + +} // End AddedComplexity = 100 + +def : Pat < + (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) +>; + +class DSWritePat : Pat < + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +def : DSWritePat ; +def : DSWritePat ; +def : DSWritePat ; + +let AddedComplexity = 100 in { + +def : DSWritePat ; +} // End AddedComplexity = 100 + +def : Pat < 
+ (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (i1 0)) +>; + +class DSAtomicRetPat : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec +// +// We need to use something for the data0, so we set a register to +// -1. For the non-rtn variants, the manual says it does +// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max +// will always do the increment so I'm assuming it's the same. +// +// We also load this -1 with s_mov_b32 / s_mov_b64 even though this +// needs to be a VGPR. The SGPR copy pass will fix this, and it's +// easier since there is no v_mov_b64. +class DSAtomicIncRetPat : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), + (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) +>; + + +class DSAtomicCmpXChg : Pat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), + (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) +>; + + +// 32-bit atomics. +def : DSAtomicIncRetPat; +def : DSAtomicIncRetPat; + +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; + +def : DSAtomicCmpXChg; + +// 64-bit atomics. +def : DSAtomicIncRetPat; +def : DSAtomicIncRetPat; + +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; +def : DSAtomicRetPat; + +def : DSAtomicCmpXChg; + + +//===----------------------------------------------------------------------===// +// MUBUF Patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUFLoad_Pattern { + def : Pat < + (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [isSICI] in { +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +defm : MUBUFLoad_Pattern ; +} // End Predicates = [isSICI] + +class MUBUFScratchLoadPat : Pat < + (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; +def : MUBUFScratchLoadPat ; + +// BUFFER_LOAD_DWORD*, addr64=0 +multiclass MUBUF_Load_Dword { + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, + imm:$offset, 0, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 1, 0, imm:$glc, imm:$slc, + imm:$tfe)), + (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword 
v4i32:$rsrc, i32:$vaddr, i32:$soffset, + imm:$offset, 0, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), (as_i1imm $tfe)) + >; + + def : Pat < + (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, + imm:$offset, 1, 1, imm:$glc, imm:$slc, + imm:$tfe)), + (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $tfe)) + >; +} + +defm : MUBUF_Load_Dword ; +defm : MUBUF_Load_Dword ; +defm : MUBUF_Load_Dword ; + +class MUBUFScratchStorePat : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, + u16imm:$offset)), + (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) +>; + +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; +def : MUBUFScratchStorePat ; + +/* +class MUBUFStore_Pattern : Pat < + (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), + (Instr $value, $srsrc, $vaddr, $offset) +>; + +let Predicates = [isSICI] in { +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +def : MUBUFStore_Pattern ; +} // End Predicates = [isSICI] + +*/ + +//===----------------------------------------------------------------------===// +// MTBUF Patterns +//===----------------------------------------------------------------------===// + +// TBUFFER_STORE_FORMAT_*, addr64=0 +class MTBUF_StoreResource : Pat< + (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, + i32:$soffset, imm:$inst_offset, imm:$dfmt, + imm:$nfmt, imm:$offen, imm:$idxen, + imm:$glc, imm:$slc, imm:$tfe), + (opcode + $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), + (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, + (as_i1imm $slc), (as_i1imm $tfe), $soffset) +>; + +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; + +let SubtargetPredicate = isCI in { + +defm V_QSAD_PK_U16_U8 : VOP3Inst , "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst , "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst , "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst , "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst , "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + +// Remaining instructions: +// FLAT_* +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// S_DCACHE_INV_VOL +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 + +} // End isCI + +/********** ====================== **********/ +/********** Indirect adressing **********/ +/********** ====================== **********/ + +multiclass SI_INDIRECT_Pattern { + + // 1. Extract with offset + def : Pat< + (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), + (SI_INDIRECT_SRC $vec, $idx, imm:$off) + >; + + // 2. Extract without offset + def : Pat< + (eltvt (vector_extract vt:$vec, i32:$idx)), + (SI_INDIRECT_SRC $vec, $idx, 0) + >; + + // 3. 
Insert with offset
+  def : Pat<
+    (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
+    (IndDst $vec, $idx, imm:$off, $val)
+  >;
+
+  // 4. Insert without offset
+  def : Pat<
+    (vector_insert vt:$vec, eltvt:$val, i32:$idx),
+    (IndDst $vec, $idx, 0, $val)
+  >;
+}
+
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+defm : SI_INDIRECT_Pattern ;
+
+//===----------------------------------------------------------------------===//
+// Conversion Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+          (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
+
+// Handle sext_inreg in i64
+def : Pat <
+  (i64 (sext_inreg i64:$src, i1)),
+  (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i8)),
+  (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i16)),
+  (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i32)),
+  (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
+>;
+
+class ZExt_i64_i32_Pat : Pat <
+  (i64 (ext i32:$src)),
+  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
+>;
+
+class ZExt_i64_i1_Pat : Pat <
+  (i64 (ext i1:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
+    (S_MOV_B32 0), sub1)
+>;
+
+def : ZExt_i64_i32_Pat;
+def : ZExt_i64_i32_Pat;
+def : ZExt_i64_i1_Pat;
+def : ZExt_i64_i1_Pat;
+
+def : Pat <
+  (i64 (sext i32:$src)),
+  (REG_SEQUENCE SReg_64, $src, sub0,
+    (S_ASHR_I32 $src, 31), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i1:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 0, -1, $src), sub0,
+    (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+>;
+
+// If we need to perform a logical operation on i1 values, we need to
+// use vector comparisons since there is only one SCC register. Vector
+// comparisons still write to a pair of SGPRs, so treat these as
+// 64-bit comparisons. When legalizing SGPR copies, instructions that
+// copy from SCC into these operations will be moved to the VALU.
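// As a rough illustration (assumed lowering, with made-up register numbers):
// once the i1 operands have been placed in SGPR pairs, an IR-level
// "%c = and i1 %a, %b" selected through the pattern below becomes a
// wavefront-wide mask operation such as
//   s_and_b64 s[0:1], s[2:3], s[4:5]
// rather than a single SCC-based scalar bit operation.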
+def : Pat < + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) +>; + +def : Pat < + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) +>; + +def : Pat < + (i1 (xor i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +def : Pat < + (f32 (sint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) +>; + +def : Pat < + (f32 (uint_to_fp i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) +>; + +def : Pat < + (f64 (sint_to_fp i1:$src)), + (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) +>; + +def : Pat < + (f64 (uint_to_fp i1:$src)), + (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) +>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i32 (trunc i64:$a)), + (EXTRACT_SUBREG $a, sub0) +>; + +def : Pat < + (i1 (trunc i32:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) +>; + +def : Pat < + (i1 (trunc i64:$a)), + (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (EXTRACT_SUBREG $a, sub0)), 1) +>; + +def : Pat < + (i32 (bswap i32:$a)), + (V_BFI_B32 (S_MOV_B32 0x00ff00ff), + (V_ALIGNBIT_B32 $a, $a, 24), + (V_ALIGNBIT_B32 $a, $a, 8)) +>; + +def : Pat < + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) +>; + +multiclass BFMPatterns { + def : Pat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : Pat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV 0)) + >; +} + +defm : BFMPatterns ; +// FIXME: defm : BFMPatterns ; + +def : BFEPattern ; + +//===----------------------------------------------------------------------===// +// Fract Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isSI] in { + +// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is +// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient +// way to implement it is using V_FRACT_F64. +// The workaround for the V_FRACT bug is: +// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) + +// Convert (x + (-floor(x)) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) +>; + +// Convert floor(x) to (x - fract(x)) +def : Pat < + (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), + (V_ADD_F64 + $mods, + $x, + SRCMODS.NEG, + (V_CNDMASK_B64_PSEUDO + $x, + (V_MIN_F64 + SRCMODS.NONE, + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), + SRCMODS.NONE, + (V_MOV_B64_PSEUDO 0x3fefffffffffffff), + DSTCLAMP.NONE, DSTOMOD.NONE), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isSI] + +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + +//============================================================================// +// Miscellaneous Optimization Patterns +//============================================================================// + +def : SHA256MaPattern ; + +//============================================================================// +// Assembler aliases +//============================================================================// + +def : MnemonicAlias<"v_add_u32", "v_add_i32">; +def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; +def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; + +} // End isGCN predicate diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td new file mode 100644 index 00000000000..027a0a2f516 --- /dev/null +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -0,0 +1,199 @@ +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// SI Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + + +let TargetPrefix = "SI", isTarget = 1 in { + + def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; + def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; + + // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed + def int_SI_tbuffer_store : Intrinsic < + [], + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW + llvm_i32_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + []>; + + // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed + def int_SI_buffer_load_dword : Intrinsic < + [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + [llvm_anyint_ty, // rsrc(SGPR) + llvm_anyint_ty, // vaddr(VGPR) + llvm_i32_ty, // soffset(SGPR) + llvm_i32_ty, // inst_offset(imm) + llvm_i32_ty, // offen(imm) + llvm_i32_ty, // idxen(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty], // tfe(imm) + [IntrReadArgMem]>; + + def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Fully-flexible SAMPLE instruction. + class SampleRaw : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_v4i32_ty, // sampler(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Image instruction without a sampler. 
+ class Image : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i32_ty, // unorm(imm) + llvm_i32_ty, // r128(imm) + llvm_i32_ty, // da(imm) + llvm_i32_ty, // glc(imm) + llvm_i32_ty, // slc(imm) + llvm_i32_ty, // tfe(imm) + llvm_i32_ty], // lwe(imm) + [IntrNoMem]>; + + // Basic sample + def int_SI_image_sample : SampleRaw; + def int_SI_image_sample_cl : SampleRaw; + def int_SI_image_sample_d : SampleRaw; + def int_SI_image_sample_d_cl : SampleRaw; + def int_SI_image_sample_l : SampleRaw; + def int_SI_image_sample_b : SampleRaw; + def int_SI_image_sample_b_cl : SampleRaw; + def int_SI_image_sample_lz : SampleRaw; + def int_SI_image_sample_cd : SampleRaw; + def int_SI_image_sample_cd_cl : SampleRaw; + + // Sample with comparison + def int_SI_image_sample_c : SampleRaw; + def int_SI_image_sample_c_cl : SampleRaw; + def int_SI_image_sample_c_d : SampleRaw; + def int_SI_image_sample_c_d_cl : SampleRaw; + def int_SI_image_sample_c_l : SampleRaw; + def int_SI_image_sample_c_b : SampleRaw; + def int_SI_image_sample_c_b_cl : SampleRaw; + def int_SI_image_sample_c_lz : SampleRaw; + def int_SI_image_sample_c_cd : SampleRaw; + def int_SI_image_sample_c_cd_cl : SampleRaw; + + // Sample with offsets + def int_SI_image_sample_o : SampleRaw; + def int_SI_image_sample_cl_o : SampleRaw; + def int_SI_image_sample_d_o : SampleRaw; + def int_SI_image_sample_d_cl_o : SampleRaw; + def int_SI_image_sample_l_o : SampleRaw; + def int_SI_image_sample_b_o : SampleRaw; + def int_SI_image_sample_b_cl_o : SampleRaw; + def int_SI_image_sample_lz_o : SampleRaw; + def int_SI_image_sample_cd_o : SampleRaw; + def int_SI_image_sample_cd_cl_o : SampleRaw; + + // Sample with comparison and offsets + def int_SI_image_sample_c_o : SampleRaw; + def int_SI_image_sample_c_cl_o : SampleRaw; + def int_SI_image_sample_c_d_o : SampleRaw; + def int_SI_image_sample_c_d_cl_o : SampleRaw; + def int_SI_image_sample_c_l_o : SampleRaw; + def int_SI_image_sample_c_b_o : SampleRaw; + def int_SI_image_sample_c_b_cl_o : SampleRaw; + def int_SI_image_sample_c_lz_o : SampleRaw; + def int_SI_image_sample_c_cd_o : SampleRaw; + def int_SI_image_sample_c_cd_cl_o : SampleRaw; + + // Basic gather4 + def int_SI_gather4 : SampleRaw; + def int_SI_gather4_cl : SampleRaw; + def int_SI_gather4_l : SampleRaw; + def int_SI_gather4_b : SampleRaw; + def int_SI_gather4_b_cl : SampleRaw; + def int_SI_gather4_lz : SampleRaw; + + // Gather4 with comparison + def int_SI_gather4_c : SampleRaw; + def int_SI_gather4_c_cl : SampleRaw; + def int_SI_gather4_c_l : SampleRaw; + def int_SI_gather4_c_b : SampleRaw; + def int_SI_gather4_c_b_cl : SampleRaw; + def int_SI_gather4_c_lz : SampleRaw; + + // Gather4 with offsets + def int_SI_gather4_o : SampleRaw; + def int_SI_gather4_cl_o : SampleRaw; + def int_SI_gather4_l_o : SampleRaw; + def int_SI_gather4_b_o : SampleRaw; + def int_SI_gather4_b_cl_o : SampleRaw; + def int_SI_gather4_lz_o : SampleRaw; + + // Gather4 with comparison and offsets + def int_SI_gather4_c_o : SampleRaw; + def int_SI_gather4_c_cl_o : SampleRaw; + def int_SI_gather4_c_l_o : SampleRaw; + def int_SI_gather4_c_b_o : SampleRaw; + def int_SI_gather4_c_b_cl_o : SampleRaw; + def int_SI_gather4_c_lz_o : SampleRaw; + + def int_SI_getlod : SampleRaw; + + // Image instrinsics. + def int_SI_image_load : Image; + def int_SI_image_load_mip : Image; + def int_SI_getresinfo : Image; + + // Deprecated image and sample intrinsics. 
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
+  def int_SI_samplel : Sample;
+  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+}
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
new file mode 100644
index 00000000000..9b1d256dc5a
--- /dev/null
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -0,0 +1,421 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with nearby immediate offsets.
+// This will fuse operations such as
+//   ds_read_b32 v0, v2 offset:16
+//   ds_read_b32 v1, v2 offset:32
+// ==>
+//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+//   each other, and then only merges adjacent pairs of instructions. It would
+//   be good to be more flexible with interleaved instructions, and possibly run
+//   before scheduling. It currently misses stores of constants because loading
+//   the constant into the data register is placed between the stores, although
+//   this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, and recomputes live intervals and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads has offsets that are too large to fit in the 8-bit
+//   offsets, but are close enough to fit in the 8 bits, we can add to the base
+//   pointer and use the new reduced offsets.
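// To make the last point concrete (a hypothetical case, not handled by the
// pass as written): two ds_read_b32 of the same base with byte offsets 4096
// and 4100 have element offsets 1024 and 1025, which cannot be encoded in the
// 8-bit offset0/offset1 fields (and are not both multiples of 64). Adding
// 4096 to the base pointer once would reduce them to 0 and 1, which a single
// ds_read2_b32 could then encode.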
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-load-store-opt" + +namespace { + +class SILoadStoreOptimizer : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + LiveIntervals *LIS; + + + static bool offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned EltSize); + + MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize); + + void updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx); + + MachineBasicBlock::iterator mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + + MachineBasicBlock::iterator mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize); + +public: + static char ID; + + SILoadStoreOptimizer() + : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), + LIS(nullptr) {} + + SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { + initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); + } + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Load / Store Optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, + "SI Load / Store Optimizer", false, false) + +char SILoadStoreOptimizer::ID = 0; + +char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; + +FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { + return new SILoadStoreOptimizer(TM); +} + +bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, + unsigned Offset1, + unsigned Size) { + // XXX - Would the same offset be OK? Is there any reason this would happen or + // be useful? + if (Offset0 == Offset1) + return false; + + // This won't be valid if the offset isn't aligned. + if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) + return false; + + unsigned EltOffset0 = Offset0 / Size; + unsigned EltOffset1 = Offset1 / Size; + + // Check if the new offsets fit in the reduced 8-bit range. + if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) + return true; + + // If the offset in elements doesn't fit in 8-bits, we might be able to use + // the stride 64 versions. 
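  // For example (assuming EltSize == 4): byte offsets 0 and 2048 give element
  // offsets 0 and 512; 512 does not fit in 8 bits, but both values are
  // multiples of 64, so the st64 encodings can use 0 and 512 / 64 = 8 instead.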
+ if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) + return false; + + return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, + unsigned EltSize){ + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + ++MBBI; + + if (MBBI->getOpcode() != I->getOpcode()) + return E; + + // Don't merge volatiles. + if (MBBI->hasOrderedMemoryRef()) + return E; + + int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); + const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); + const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg0.getReg() == AddrReg1.getReg() && + AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), + AMDGPU::OpName::offset); + unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; + unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + + // Check both offsets fit in the reduced range. + if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) + return MBBI; + } + + return E; +} + +void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, + unsigned DstReg, + unsigned SubIdx) { + for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), + E = MRI->reg_end(); I != E; ) { + MachineOperand &O = *I; + ++I; + O.substVirtReg(DstReg, SubIdx, *TRI); + } +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be careful, since the addresses could be subregisters themselves in weird + // cases, like vectors of pointers. + const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + + unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); + unsigned DestReg1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Read2Desc = TII->get(Opc); + + const TargetRegisterClass *SuperRC + = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + + DebugLoc DL = I->getDebugLoc(); + MachineInstrBuilder Read2 + = BuildMI(*MBB, I, DL, Read2Desc, DestReg) + .addOperand(*AddrReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; + updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); + updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); + + LIS->RemoveMachineInstrFromMaps(I); + // Replacing Paired in the maps with Read2 allows us to avoid updating the + // live range for the m0 register. + LIS->ReplaceMachineInstrInMaps(Paired, Read2); + I->eraseFromParent(); + Paired->eraseFromParent(); + + LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); + LIS->shrinkToUses(&AddrRegLI); + + LIS->getInterval(DestReg); // Create new LI + + DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + return Read2.getInstr(); +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( + MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + unsigned EltSize) { + MachineBasicBlock *MBB = I->getParent(); + + // Be sure to use .addOperand(), and not .addReg() with these. We want to be + // sure we preserve the subregister index and any register flags set on them. + const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); + const MachineOperand *Data1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); + + + unsigned Offset0 + = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; + unsigned Offset1 + = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; + + unsigned NewOffset0 = Offset0 / EltSize; + unsigned NewOffset1 = Offset1 / EltSize; + unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + + // Prefer the st64 form if we can use it, even if we can fit the offset in the + // non st64 version. I'm not sure if there's any real reason to do this. + bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); + if (UseST64) { + NewOffset0 /= 64; + NewOffset1 /= 64; + Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + } + + assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && + (NewOffset0 != NewOffset1) && + "Computed offset doesn't fit"); + + const MCInstrDesc &Write2Desc = TII->get(Opc); + DebugLoc DL = I->getDebugLoc(); + + // repairLiveintervalsInRange() doesn't handle physical register, so we have + // to update the M0 range manually. + SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); + LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); + bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); + + MachineInstrBuilder Write2 + = BuildMI(*MBB, I, DL, Write2Desc) + .addOperand(*Addr) // addr + .addOperand(*Data0) // data0 + .addOperand(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .addMemOperand(*I->memoperands_begin()) + .addMemOperand(*Paired->memoperands_begin()); + + // XXX - How do we express subregisters here? 
+ unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; + + LIS->RemoveMachineInstrFromMaps(I); + LIS->RemoveMachineInstrFromMaps(Paired); + I->eraseFromParent(); + Paired->eraseFromParent(); + + // This doesn't handle physical registers like M0 + LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); + + if (UpdateM0Range) { + SlotIndex Write2Index = LIS->getInstructionIndex(Write2); + M0Segment->end = Write2Index.getRegSlot(); + } + + DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + return Write2.getInstr(); +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. +bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { + MachineInstr &MI = *I; + + // Don't combine if volatile. + if (MI.hasOrderedMemoryRef()) { + ++I; + continue; + } + + unsigned Opc = MI.getOpcode(); + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { + unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeRead2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { + unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; + MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); + if (Match != E) { + Modified = true; + I = mergeWrite2Pair(I, Match, Size); + } else { + ++I; + } + + continue; + } + + ++I; + } + + return Modified; +} + +bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { + const TargetSubtargetInfo &STM = MF.getSubtarget(); + TRI = static_cast(STM.getRegisterInfo()); + TII = static_cast(STM.getInstrInfo()); + MRI = &MF.getRegInfo(); + + LIS = &getAnalysis(); + + DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); + + assert(!MRI->isSSA()); + + bool Modified = false; + + for (MachineBasicBlock &MBB : MF) + Modified |= optimizeBlock(MBB); + + return Modified; +} diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp new file mode 100644 index 00000000000..c319b32111f --- /dev/null +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -0,0 +1,605 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass lowers the pseudo control flow instructions to real +/// machine instructions. +/// +/// All control flow is handled using predicated instructions and +/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector +/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs +/// by writting to the 64-bit EXEC register (each bit corresponds to a +/// single vector ALU). Typically, for predicates, a vector ALU will write +/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each +/// Vector ALU) and then the ScalarALU will AND the VCC register with the +/// EXEC to update the predicates. 
+/// +/// For example: +/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 +/// %SGPR0 = SI_IF %VCC +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 +/// %SGPR0 = SI_ELSE %SGPR0 +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 +/// SI_END_CF %SGPR0 +/// +/// becomes: +/// +/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask +/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// S_CBRANCH_EXECZ label0 // This instruction is an optional +/// // optimization which allows us to +/// // branch if all the bits of +/// // EXEC are zero. +/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch +/// +/// label0: +/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block +/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// S_BRANCH_EXECZ label1 // Use our branch optimization +/// // instruction again. +/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block +/// label1: +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" + +using namespace llvm; + +namespace { + +class SILowerControlFlowPass : public MachineFunctionPass { + +private: + static const unsigned SkipThreshold = 12; + + static char ID; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + + bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); + + void Skip(MachineInstr &From, MachineOperand &To); + void SkipIfDead(MachineInstr &MI); + + void If(MachineInstr &MI); + void Else(MachineInstr &MI); + void Break(MachineInstr &MI); + void IfBreak(MachineInstr &MI); + void ElseBreak(MachineInstr &MI); + void Loop(MachineInstr &MI); + void EndCf(MachineInstr &MI); + + void Kill(MachineInstr &MI); + void Branch(MachineInstr &MI); + + void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); + void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); + void IndirectSrc(MachineInstr &MI); + void IndirectDst(MachineInstr &MI); + +public: + SILowerControlFlowPass(TargetMachine &tm) : + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower control flow instructions"; + } + +}; + +} // End anonymous namespace + +char SILowerControlFlowPass::ID = 0; + +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { + return new SILowerControlFlowPass(tm); +} + +bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, + MachineBasicBlock *To) { + + unsigned NumInstr = 0; + + for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); + MBB = *MBB->succ_begin()) { + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + NumInstr < SkipThreshold && I != E; ++I) { + + if (I->isBundle() || !I->isBundled()) + if (++NumInstr >= SkipThreshold) + return true; + } + } + + return false; +} + +void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { + + if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) + return; + + DebugLoc DL = From.getDebugLoc(); + BuildMI(*From.getParent(), 
&From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) + .addOperand(To) + .addReg(AMDGPU::EXEC); +} + +void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + if (MBB.getParent()->getInfo()->getShaderType() != + ShaderType::PIXEL || + !shouldSkip(&MBB, &MBB.getParent()->back())) + return; + + MachineBasicBlock::iterator Insert = &MI; + ++Insert; + + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0) + .addReg(AMDGPU::VGPR0); + + // ... and terminate wavefront + BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); +} + +void SILowerControlFlowPass::If(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + Skip(MI, MI.getOperand(2)); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)) + .addReg(AMDGPU::EXEC); + + MI.eraseFromParent(); +} + +void 
SILowerControlFlowPass::EndCf(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Branch(MachineInstr &MI) { + if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) + MI.eraseFromParent(); + + // If these aren't equal, this is probably an infinite loop. +} + +void SILowerControlFlowPass::Kill(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + +#ifndef NDEBUG + const SIMachineFunctionInfo *MFI + = MBB.getParent()->getInfo(); + // Kill is only allowed in pixel / geometry shaders. + assert(MFI->getShaderType() == ShaderType::PIXEL || + MFI->getShaderType() == ShaderType::GEOMETRY); +#endif + + // Clear this thread from the exec mask if the operand is negative + if ((Op.isImm())) { + // Constant operand: Set exec mask to 0 or do nothing + if (Op.getImm() & 0x80000000) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + } + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + .addImm(0) + .addOperand(Op); + } + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I = MI; + + unsigned Save = MI.getOperand(1).getReg(); + unsigned Idx = MI.getOperand(3).getReg(); + + if (AMDGPU::SReg_32RegClass.contains(Idx)) { + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx) + .addImm(Offset); + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx); + } + MBB.insert(I, MovRel); + } else { + + assert(AMDGPU::SReg_64RegClass.contains(Save)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); + + // Save the EXEC mask + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + AMDGPU::VCC_LO) + .addReg(Idx); + + // Move index from VCC into M0 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + + // Compare the just read M0 value to all possible Idx values + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) + .addReg(AMDGPU::M0) + .addReg(Idx); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + if (Offset) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } + // Do the actual move + MBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(-7) + .addReg(AMDGPU::EXEC); + + // Restore EXEC + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); + + } + MI.eraseFromParent(); +} + +/// \param @VecReg The register which holds 
element zero of the vector +/// being addressed into. +/// \param[out] @Reg The base register to use in the indirect addressing instruction. +/// \param[in,out] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] +// As an output, this is a constant value that needs +// to be added to the value stored in M0. +void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, + unsigned &Reg, + int &Offset) { + unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); + if (!SubReg) + SubReg = VecReg; + + const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); + int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + + if (RegIdx < 0) { + Offset = RegIdx; + RegIdx = 0; + } else { + Offset = 0; + } + + Reg = RC->getRegister(RegIdx); +} + +void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vec = MI.getOperand(2).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Reg; + + computeIndirectRegAndOffset(Vec, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) + .addReg(Reg) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Vec, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + int Off = MI.getOperand(4).getImm(); + unsigned Val = MI.getOperand(5).getReg(); + unsigned Reg; + + computeIndirectRegAndOffset(Dst, Reg, Off); + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) + .addReg(Reg, RegState::Define) + .addReg(Val) + .addReg(AMDGPU::M0, RegState::Implicit) + .addReg(Dst, RegState::Implicit); + + LoadM0(MI, MovRel, Off); +} + +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast(MF.getSubtarget().getRegisterInfo()); + SIMachineFunctionInfo *MFI = MF.getInfo(); + + bool HaveKill = false; + bool NeedWQM = false; + bool NeedFlat = false; + unsigned Depth = 0; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + + MachineInstr &MI = *I; + if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + NeedWQM = true; + + // Flat uses m0 in case it needs to access LDS. 
+ if (TII->isFLAT(MI.getOpcode())) + NeedFlat = true; + + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_IF: + ++Depth; + If(MI); + break; + + case AMDGPU::SI_ELSE: + Else(MI); + break; + + case AMDGPU::SI_BREAK: + Break(MI); + break; + + case AMDGPU::SI_IF_BREAK: + IfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + ElseBreak(MI); + break; + + case AMDGPU::SI_LOOP: + ++Depth; + Loop(MI); + break; + + case AMDGPU::SI_END_CF: + if (--Depth == 0 && HaveKill) { + SkipIfDead(MI); + HaveKill = false; + } + EndCf(MI); + break; + + case AMDGPU::SI_KILL: + if (Depth == 0) + SkipIfDead(MI); + else + HaveKill = true; + Kill(MI); + break; + + case AMDGPU::S_BRANCH: + Branch(MI); + break; + + case AMDGPU::SI_INDIRECT_SRC: + IndirectSrc(MI); + break; + + case AMDGPU::SI_INDIRECT_DST_V1: + case AMDGPU::SI_INDIRECT_DST_V2: + case AMDGPU::SI_INDIRECT_DST_V4: + case AMDGPU::SI_INDIRECT_DST_V8: + case AMDGPU::SI_INDIRECT_DST_V16: + IndirectDst(MI); + break; + } + } + } + + if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { + MachineBasicBlock &MBB = MF.front(); + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC).addReg(AMDGPU::EXEC); + } + + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + + return true; +} diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp new file mode 100644 index 00000000000..67421e231d8 --- /dev/null +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -0,0 +1,151 @@ +//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// i1 values are usually inserted by the CFG Structurize pass and they are +/// unique in that they can be copied from VALU to SALU registers. +/// This is not possible for any other value type. Since there are no +/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. 
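/// As a sketch of what the rewrites below produce (register names are
/// illustrative only): a copy from an SGPR-pair i1 into a VReg_1 becomes
///   v_cndmask_b32_e64 vDst, 0, -1, s[0:1]
/// and a copy from a VReg_1 back into an SGPR pair becomes
///   v_cmp_ne_i32_e64 s[0:1], vSrc, 0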
+/// +//===----------------------------------------------------------------------===// +// + +#define DEBUG_TYPE "si-i1-copies" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace { + +class SILowerI1Copies : public MachineFunctionPass { +public: + static char ID; + +public: + SILowerI1Copies() : MachineFunctionPass(ID) { + initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Lower i1 Copies"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) + +char SILowerI1Copies::ID = 0; + +char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; + +FunctionPass *llvm::createSILowerI1CopiesPass() { + return new SILowerI1Copies(); +} + +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::vector I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { + unsigned Reg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + if (RC == &AMDGPU::VReg_1RegClass) + MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); + continue; + } + + if (MI.getOpcode() != AMDGPU::COPY) + continue; + + const MachineOperand &Dst = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); + + if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; + + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); + const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + + if (DstRC == &AMDGPU::VReg_1RegClass && + TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { + I1Defs.push_back(Dst.getReg()); + DebugLoc DL = MI.getDebugLoc(); + + MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); + if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { + if (DefInst->getOperand(1).isImm()) { + I1Defs.push_back(Dst.getReg()); + + int64_t Val = DefInst->getOperand(1).getImm(); + assert(Val == 0 || Val == -1); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) + .addOperand(Dst) + .addImm(Val); + MI.eraseFromParent(); + continue; + } + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) + .addOperand(Dst) + .addImm(0) + .addImm(-1) + .addOperand(Src); + MI.eraseFromParent(); + } else if 
(TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && + SrcRC == &AMDGPU::VReg_1RegClass) { + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) + .addOperand(Dst) + .addOperand(Src) + .addImm(0); + MI.eraseFromParent(); + } + } + } + + for (unsigned Reg : I1Defs) + MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); + + return false; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp new file mode 100644 index 00000000000..587ea63d679 --- /dev/null +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -0,0 +1,77 @@ +//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + + +#include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +#define MAX_LANES 64 + +using namespace llvm; + + +// Pin the vtable to this file. +void SIMachineFunctionInfo::anchor() {} + +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) + : AMDGPUMachineFunction(MF), + TIDReg(AMDGPU::NoRegister), + HasSpilledVGPRs(false), + PSInputAddr(0), + NumUserSGPRs(0), + LDSWaveSpillSize(0) { } + +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( + MachineFunction *MF, + unsigned FrameIndex, + unsigned SubIdx) { + const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIRegisterInfo *TRI = static_cast( + MF->getSubtarget().getRegisterInfo()); + MachineRegisterInfo &MRI = MF->getRegInfo(); + int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); + Offset += SubIdx * 4; + + unsigned LaneVGPRIdx = Offset / (64 * 4); + unsigned Lane = (Offset / 4) % 64; + + struct SpilledReg Spill; + + if (!LaneVGPRs.count(LaneVGPRIdx)) { + unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); + LaneVGPRs[LaneVGPRIdx] = LaneVGPR; + MRI.setPhysRegUsed(LaneVGPR); + + // Add this register as live-in to all blocks to avoid machine verifer + // complaining about use of an undefined physical register. + for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); + BI != BE; ++BI) { + BI->addLiveIn(LaneVGPR); + } + } + + Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; + Spill.Lane = Lane; + return Spill; +} + +unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget(); + // FIXME: We should get this information from kernel attributes if it + // is available. + return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize(); +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h new file mode 100644 index 00000000000..667da4c8af6 --- /dev/null +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -0,0 +1,66 @@ +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H + +#include "AMDGPUMachineFunction.h" +#include "SIRegisterInfo.h" +#include + +namespace llvm { + +class MachineRegisterInfo; + +/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which +/// tells the hardware which interpolation parameters to load. +class SIMachineFunctionInfo : public AMDGPUMachineFunction { + void anchor() override; + + unsigned TIDReg; + bool HasSpilledVGPRs; + +public: + + struct SpilledReg { + unsigned VGPR; + int Lane; + SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } + SpilledReg() : VGPR(0), Lane(-1) { } + bool hasLane() { return Lane != -1;} + }; + + // SIMachineFunctionInfo definition + + SIMachineFunctionInfo(const MachineFunction &MF); + SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, + unsigned SubIdx); + unsigned PSInputAddr; + unsigned NumUserSGPRs; + std::map LaneVGPRs; + unsigned LDSWaveSpillSize; + unsigned ScratchOffsetReg; + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; + unsigned getTIDReg() const { return TIDReg; }; + void setTIDReg(unsigned Reg) { TIDReg = Reg; } + bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } + void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; +}; + +} // End namespace llvm + + +#endif diff --git a/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp new file mode 100644 index 00000000000..0a7f684552f --- /dev/null +++ b/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp @@ -0,0 +1,194 @@ +//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This pass loads scratch pointer and scratch offset into a register or a +/// frame index which can be used anywhere in the program. These values will +/// be used for spilling VGPRs. 
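/// Roughly, for each SI_SPILL_V* instruction the pass scavenges an SReg_128
/// and materializes the scratch buffer resource descriptor into it, e.g.
/// (register names are illustrative only):
///   s_mov_b32 s8,  SCRATCH_RSRC_DWORD0   // resource base, low word
///   s_mov_b32 s9,  SCRATCH_RSRC_DWORD1   // resource base, high word
///   s_mov_b32 s10, -1                    // size field (0xffffffff)
///   s_mov_b32 s11, Rsrc >> 32            // data format and TID-enable bits
/// before rewriting the spill's srsrc / soffset operands to use it.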
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +namespace { + +class SIPrepareScratchRegs : public MachineFunctionPass { + +private: + static char ID; + +public: + SIPrepareScratchRegs() : MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI prepare scratch registers"; + } + +}; + +} // End anonymous namespace + +char SIPrepareScratchRegs::ID = 0; + +FunctionPass *llvm::createSIPrepareScratchRegs() { + return new SIPrepareScratchRegs(); +} + +bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock::iterator I = Entry->begin(); + DebugLoc DL = I->getDebugLoc(); + + // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to + // run this pass. + if (!MFI->hasSpilledVGPRs()) + return false; + + unsigned ScratchPtrPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); + unsigned ScratchOffsetPreloadReg = + TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); + + if (!Entry->isLiveIn(ScratchPtrPreloadReg)) + Entry->addLiveIn(ScratchPtrPreloadReg); + + if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) + Entry->addLiveIn(ScratchOffsetPreloadReg); + + // Load the scratch offset. + unsigned ScratchOffsetReg = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); + int ScratchOffsetFI = -1; + + if (ScratchOffsetReg != AMDGPU::NoRegister) { + // Found an SGPR to use + MRI.setPhysRegUsed(ScratchOffsetReg); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) + .addReg(ScratchOffsetPreloadReg); + } else { + // No SGPR is available, we must spill. + ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); + BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) + .addReg(ScratchOffsetPreloadReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } + + + // Now that we have the scratch pointer and offset values, we need to + // add them to all the SI_SPILL_V* instructions. + + RegScavenger RS; + unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); + RS.addScavengingFrameIndex(ScratchRsrcFI); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + // Add the scratch offset reg as a live-in so that the register scavenger + // doesn't re-use it. 
+ if (!MBB.isLiveIn(ScratchOffsetReg) && + ScratchOffsetReg != AMDGPU::NoRegister) + MBB.addLiveIn(ScratchOffsetReg); + RS.enterBasicBlock(&MBB); + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + RS.forward(I); + DebugLoc DL = MI.getDebugLoc(); + if (!TII->isVGPRSpill(MI.getOpcode())) + continue; + + // Scratch resource + unsigned ScratchRsrcReg = + RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); + + uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | + 0xffffffff; // Size + + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) + .addImm(Rsrc & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) + .addImm(Rsrc >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + // Scratch Offset + if (ScratchOffsetReg == AMDGPU::NoRegister) { + ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), + ScratchOffsetReg) + .addFrameIndex(ScratchOffsetFI) + .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) + .addReg(AMDGPU::SGPR0, RegState::Undef); + } else if (!MBB.isLiveIn(ScratchOffsetReg)) { + MBB.addLiveIn(ScratchOffsetReg); + } + + if (ScratchRsrcReg == AMDGPU::NoRegister || + ScratchOffsetReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("ran out of SGPRs for spilling VGPRs"); + ScratchRsrcReg = AMDGPU::SGPR0; + ScratchOffsetReg = AMDGPU::SGPR0; + } + MI.getOperand(2).setReg(ScratchRsrcReg); + MI.getOperand(2).setIsKill(true); + MI.getOperand(2).setIsUndef(false); + MI.getOperand(3).setReg(ScratchOffsetReg); + MI.getOperand(3).setIsUndef(false); + MI.getOperand(3).setIsKill(false); + MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); + MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); + } + } + return true; +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp new file mode 100644 index 00000000000..db2ff0b1f95 --- /dev/null +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -0,0 +1,543 @@ +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI implementation of the TargetRegisterInfo class. 
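+///
+/// Besides the usual register-class queries, this implementation handles
+/// frame index elimination for the SI_SPILL_* pseudos: SGPR spills are
+/// written to VGPR lanes with V_WRITELANE/V_READLANE, and VGPR spills are
+/// expanded into scratch buffer loads and stores.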
+// +//===----------------------------------------------------------------------===// + + +#include "SIRegisterInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + Reserved.set(AMDGPU::EXEC); + + // EXEC_LO and EXEC_HI could be allocated and used as regular register, + // but this seems likely to result in bugs, so I'm marking them as reserved. + Reserved.set(AMDGPU::EXEC_LO); + Reserved.set(AMDGPU::EXEC_HI); + + Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); + Reserved.set(AMDGPU::FLAT_SCR); + Reserved.set(AMDGPU::FLAT_SCR_LO); + Reserved.set(AMDGPU::FLAT_SCR_HI); + + // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs + Reserved.set(AMDGPU::VGPR255); + Reserved.set(AMDGPU::VGPR254); + + // Tonga and Iceland can only allocate a fixed number of SGPRs due + // to a hw bug. + if (MF.getSubtarget().hasSGPRInitBug()) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). + // Assume XNACK_MASK is unused. + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; + + for (unsigned i = Limit; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); + } + } + + return Reserved; +} + +unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const { + + const AMDGPUSubtarget &STI = MF.getSubtarget(); + // FIXME: We should adjust the max number of waves based on LDS size. 
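+  // The limit for each pressure set is the number of registers that still
+  // allows the maximum wave count, scaled down by the number of 32-bit
+  // sub-registers a member of the class occupies. For example, on SI at
+  // 10 waves per CU this allows 48 SGPRs, so a 64-bit SGPR class gets a
+  // limit of 48 / 2 = 24.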
+ unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), + STI.getMaxWavesPerCU()); + unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + + for (regclass_iterator I = regclass_begin(), E = regclass_end(); + I != E; ++I) { + + unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned Limit; + + if (isSGPRClass(*I)) { + Limit = SGPRLimit / NumSubRegs; + } else { + Limit = VGPRLimit / NumSubRegs; + } + + const int *Sets = getRegClassPressureSets(*I); + assert(Sets); + for (unsigned i = 0; Sets[i] != -1; ++i) { + if (Sets[i] == (int)Idx) + return Limit; + } + } + return 256; +} + +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_SAVE: + case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case AMDGPU::SI_SPILL_V32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, + unsigned Value, + unsigned ScratchRsrcReg, + unsigned ScratchOffset, + int64_t Offset, + RegScavenger *RS) const { + + MachineBasicBlock *MBB = MI->getParent(); + const MachineFunction *MF = MI->getParent()->getParent(); + const SIInstrInfo *TII = + static_cast(MF->getSubtarget().getInstrInfo()); + LLVMContext &Ctx = MF->getFunction()->getContext(); + DebugLoc DL = MI->getDebugLoc(); + bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + + bool RanOutOfSGPRs = false; + unsigned SOffset = ScratchOffset; + + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned Size = NumSubRegs * 4; + + if (!isUInt<12>(Offset + Size)) { + SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + if (SOffset == AMDGPU::NoRegister) { + RanOutOfSGPRs = true; + SOffset = AMDGPU::SGPR0; + } + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffset) + .addImm(Offset); + Offset = 0; + } + + if (RanOutOfSGPRs) + Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { + unsigned SubReg = NumSubRegs > 1 ? 
+ getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : + Value; + bool IsKill = (i == e - 1); + + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg, getKillRegState(IsKill)) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + } +} + +void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + MachineFunction *MF = MI->getParent()->getParent(); + MachineBasicBlock *MBB = MI->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIInstrInfo *TII = + static_cast(MF->getSubtarget().getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + MachineOperand &FIOp = MI->getOperand(FIOperandNum); + int Index = MI->getOperand(FIOperandNum).getIndex(); + + switch (MI->getOpcode()) { + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg) + .addImm(Spill.Lane); + + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = + MFI->getSpilledReg(MF, Index, i); + + if (Spill.VGPR == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling SGPR"); + } + + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } + + // TODO: only do this when it is needed + switch (MF->getSubtarget().getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI + TII->insertNOPs(MI, 3); + break; + case AMDGPUSubtarget::SEA_ISLANDS: + break; + default: // VOLCANIC_ISLANDS and later + // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI + // and later. This also applies to VALUs which write VCC, but we're + // unlikely to see VMEM use VCC. 
+ TII->insertNOPs(MI, 4); + } + + MI->eraseFromParent(); + break; + } + + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: + buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V96_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), + FrameInfo->getObjectOffset(Index), RS); + MI->eraseFromParent(); + break; + } + + default: { + int64_t Offset = FrameInfo->getObjectOffset(Index); + FIOp.ChangeToImmediate(Offset); + if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + BuildMI(*MBB, MI, MI->getDebugLoc(), + TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addImm(Offset); + FIOp.ChangeToRegister(TmpReg, false, false, true); + } + } + } +} + +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::VGPR_32RegClass; + } +} + +unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { + return getEncodingValue(Reg) & 0xff; +} + +const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { + assert(!TargetRegisterInfo::isVirtualRegister(Reg)); + + static const TargetRegisterClass *BaseClasses[] = { + &AMDGPU::VGPR_32RegClass, + &AMDGPU::SReg_32RegClass, + &AMDGPU::VReg_64RegClass, + &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, + &AMDGPU::SReg_128RegClass, + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass + }; + + for (const TargetRegisterClass *BaseClass : BaseClasses) { + if (BaseClass->contains(Reg)) { + return BaseClass; + } + } + return nullptr; +} + +bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || + getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); +} + +const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const { + if (hasVGPRs(SRC)) { + return SRC; + } else if (SRC == &AMDGPU::SCCRegRegClass) { + return &AMDGPU::VCCRegRegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { + return &AMDGPU::VGPR_32RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { + return &AMDGPU::VReg_64RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { + return &AMDGPU::VReg_128RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { + return 
&AMDGPU::VReg_256RegClass; + } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { + return &AMDGPU::VReg_512RegClass; + } + return nullptr; +} + +const TargetRegisterClass *SIRegisterInfo::getSubRegClass( + const TargetRegisterClass *RC, unsigned SubIdx) const { + if (SubIdx == AMDGPU::NoSubRegister) + return RC; + + // If this register has a sub-register, we can safely assume it is a 32-bit + // register, because all of SI's sub-registers are 32-bit. + if (isSGPRClass(RC)) { + return &AMDGPU::SGPR_32RegClass; + } else { + return &AMDGPU::VGPR_32RegClass; + } +} + +unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, + const TargetRegisterClass *SubRC, + unsigned Channel) const { + + switch (Reg) { + case AMDGPU::VCC: + switch(Channel) { + case 0: return AMDGPU::VCC_LO; + case 1: return AMDGPU::VCC_HI; + default: llvm_unreachable("Invalid SubIdx for VCC"); + } + + case AMDGPU::FLAT_SCR: + switch (Channel) { + case 0: + return AMDGPU::FLAT_SCR_LO; + case 1: + return AMDGPU::FLAT_SCR_HI; + default: + llvm_unreachable("Invalid SubIdx for FLAT_SCR"); + } + break; + + case AMDGPU::EXEC: + switch (Channel) { + case 0: + return AMDGPU::EXEC_LO; + case 1: + return AMDGPU::EXEC_HI; + default: + llvm_unreachable("Invalid SubIdx for EXEC"); + } + break; + } + + const TargetRegisterClass *RC = getPhysRegClass(Reg); + // 32-bit registers don't have sub-registers, so we can just return the + // Reg. We need to have this check here, because the calculation below + // using getHWRegIndex() will fail with special 32-bit registers like + // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. + if (RC->getSize() == 4) { + assert(Channel == 0); + return Reg; + } + + unsigned Index = getHWRegIndex(Reg); + return SubRC->getRegister(Index + Channel); +} + +bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { + return OpType == AMDGPU::OPERAND_REG_IMM32; +} + +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (opCanUseLiteralConstant(OpType)) + return true; + + return OpType == AMDGPU::OPERAND_REG_INLINE_C; +} + +unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const { + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + switch (Value) { + case SIRegisterInfo::TGID_X: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); + case SIRegisterInfo::TGID_Y: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); + case SIRegisterInfo::TGID_Z: + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); + case SIRegisterInfo::SCRATCH_WAVE_OFFSET: + if (MFI->getShaderType() != ShaderType::COMPUTE) + return MFI->ScratchOffsetReg; + return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); + case SIRegisterInfo::SCRATCH_PTR: + return AMDGPU::SGPR2_SGPR3; + case SIRegisterInfo::INPUT_PTR: + return AMDGPU::SGPR0_SGPR1; + case SIRegisterInfo::TIDIG_X: + return AMDGPU::VGPR0; + case SIRegisterInfo::TIDIG_Y: + return AMDGPU::VGPR1; + case SIRegisterInfo::TIDIG_Z: + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected preloaded value type"); +} + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +// AMDGPU::NoRegister. 
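+/// SIPrepareScratchRegs, for example, uses this to pick an SGPR for the
+/// scratch wave offset; callers must be prepared to handle the
+/// AMDGPU::NoRegister result and fall back to spilling.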
+unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const { + + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) { + if (!MRI.isPhysRegUsed(*I)) + return *I; + } + return AMDGPU::NoRegister; +} + +unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { + switch(WaveCount) { + case 10: return 24; + case 9: return 28; + case 8: return 32; + case 7: return 36; + case 6: return 40; + case 5: return 48; + case 4: return 64; + case 3: return 84; + case 2: return 128; + default: return 256; + } +} + +unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const { + if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + switch (WaveCount) { + case 10: return 80; + case 9: return 80; + case 8: return 96; + default: return 102; + } + } else { + switch(WaveCount) { + case 10: return 48; + case 9: return 56; + case 8: return 64; + case 7: return 72; + case 6: return 80; + case 5: return 96; + default: return 103; + } + } +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h new file mode 100644 index 00000000000..bfdb67c5e12 --- /dev/null +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -0,0 +1,131 @@ +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition for SIRegisterInfo +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Support/Debug.h" + +namespace llvm { + +struct SIRegisterInfo : public AMDGPURegisterInfo { + + SIRegisterInfo(); + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + unsigned getRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override; + + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const override; + + /// \brief get the register class of the specified type to use in the + /// CFGStructurizer + const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + + unsigned getHWRegIndex(unsigned Reg) const override; + + /// \brief Return the 'base' register class for this register. + /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. + const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; + + /// \returns true if this class contains only SGPR registers + bool isSGPRClass(const TargetRegisterClass *RC) const { + if (!RC) + return false; + + return !hasVGPRs(RC); + } + + /// \returns true if this class ID contains only SGPR registers + bool isSGPRClassID(unsigned RCID) const { + if (static_cast(RCID) == -1) + return false; + + return isSGPRClass(getRegClass(RCID)); + } + + /// \returns true if this class contains VGPR registers. 
+ bool hasVGPRs(const TargetRegisterClass *RC) const; + + /// \returns A VGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentVGPRClass( + const TargetRegisterClass *SRC) const; + + /// \returns The register class that is used for a sub-register of \p RC for + /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will + /// be returned. + const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, + unsigned SubIdx) const; + + /// \p Channel This is the register channel (e.g. a value from 0-16), not the + /// SubReg index. + /// \returns The sub-register of Reg that is in Channel. + unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, + unsigned Channel) const; + + /// \returns True if operands defined with this operand type can accept + /// a literal constant (i.e. any 32-bit immediate). + bool opCanUseLiteralConstant(unsigned OpType) const; + + /// \returns True if operands defined with this operand type can accept + /// an inline constant. i.e. An integer value in the range (-16, 64) or + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + bool opCanUseInlineConstant(unsigned OpType) const; + + enum PreloadedValue { + TGID_X, + TGID_Y, + TGID_Z, + SCRATCH_WAVE_OFFSET, + SCRATCH_PTR, + INPUT_PTR, + TIDIG_X, + TIDIG_Y, + TIDIG_Z + }; + + /// \brief Returns the physical register that \p Value is stored in. + unsigned getPreloadedValue(const MachineFunction &MF, + enum PreloadedValue Value) const; + + /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumVGPRsAllowed(unsigned WaveCount) const; + + /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount + /// concurrent waves. + unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, + unsigned WaveCount) const; + + unsigned findUnusedRegister(const MachineRegisterInfo &MRI, + const TargetRegisterClass *RC) const; + +private: + void buildScratchLoadStore(MachineBasicBlock::iterator MI, + unsigned LoadStoreOp, unsigned Value, + unsigned ScratchRsrcReg, unsigned ScratchOffset, + int64_t Offset, RegScavenger *RS) const; +}; + +} // End namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td new file mode 100644 index 00000000000..2a9017fa2a9 --- /dev/null +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -0,0 +1,284 @@ +//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the SI registers +//===----------------------------------------------------------------------===// + +class SIReg encoding = 0> : Register { + let Namespace = "AMDGPU"; + let HWEncoding = encoding; +} + +// Special Registers +def VCC_LO : SIReg<"vcc_lo", 106>; +def VCC_HI : SIReg<"vcc_hi", 107>; + +// VCC for 64-bit instructions +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 106; +} + +def EXEC_LO : SIReg<"exec_lo", 126>; +def EXEC_HI : SIReg<"exec_hi", 127>; + +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 126; +} + +def SCC : SIReg<"scc", 253>; +def M0 : SIReg <"m0", 124>; + +def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. +def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. + +// Pair to indicate location of scratch space for flat accesses. +def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + +// SGPR registers +foreach Index = 0-101 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index>; +} + +// VGPR registers +foreach Index = 0-255 in { + def VGPR#Index : SIReg <"VGPR"#Index, Index> { + let HWEncoding{8} = 1; + } +} + +//===----------------------------------------------------------------------===// +// Groupings using register classes and tuples +//===----------------------------------------------------------------------===// + +// SGPR 32-bit registers +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "SGPR%u", 0, 101))>; + +// SGPR 64-bit registers +def SGPR_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate (trunc SGPR_32, 101), 2)), + (add (decimate (shl SGPR_32, 1), 2))]>; + +// SGPR 128-bit registers +def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate (trunc SGPR_32, 99), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4))]>; + +// SGPR 256-bit registers +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (decimate (trunc SGPR_32, 95), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4))]>; + +// SGPR 512-bit registers +def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (decimate (trunc SGPR_32, 87), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4)), + (add (decimate (shl SGPR_32, 4), 4)), + (add (decimate (shl SGPR_32, 5), 4)), + (add (decimate (shl SGPR_32, 6), 4)), + (add (decimate (shl SGPR_32, 7), 4)), + (add (decimate (shl SGPR_32, 8), 4)), + (add (decimate (shl SGPR_32, 9), 4)), + (add (decimate (shl SGPR_32, 10), 4)), + (add (decimate (shl SGPR_32, 11), 4)), + (add (decimate (shl SGPR_32, 12), 4)), + (add (decimate (shl SGPR_32, 13), 4)), + (add 
(decimate (shl SGPR_32, 14), 4)), + (add (decimate (shl SGPR_32, 15), 4))]>; + +// VGPR 32-bit registers +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "VGPR%u", 0, 255))>; + +// VGPR 64-bit registers +def VGPR_64 : RegisterTuples<[sub0, sub1], + [(add (trunc VGPR_32, 255)), + (add (shl VGPR_32, 1))]>; + +// VGPR 96-bit registers +def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], + [(add (trunc VGPR_32, 254)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2))]>; + +// VGPR 128-bit registers +def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (trunc VGPR_32, 253)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3))]>; + +// VGPR 256-bit registers +def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (trunc VGPR_32, 249)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7))]>; + +// VGPR 512-bit registers +def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], + [(add (trunc VGPR_32, 241)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2)), + (add (shl VGPR_32, 3)), + (add (shl VGPR_32, 4)), + (add (shl VGPR_32, 5)), + (add (shl VGPR_32, 6)), + (add (shl VGPR_32, 7)), + (add (shl VGPR_32, 8)), + (add (shl VGPR_32, 9)), + (add (shl VGPR_32, 10)), + (add (shl VGPR_32, 11)), + (add (shl VGPR_32, 12)), + (add (shl VGPR_32, 13)), + (add (shl VGPR_32, 14)), + (add (shl VGPR_32, 15))]>; + +//===----------------------------------------------------------------------===// +// Register classes used as source and destination +//===----------------------------------------------------------------------===// + +class RegImmMatcher : AsmOperandClass { + let Name = name; + let RenderMethod = "addRegOrImmOperands"; +} + +// Special register classes for predicates and the M0 register +def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { + let CopyCost = -1; // Theoretically it is possible to read from SCC, + // but it should never be necessary. 
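+  // A negative copy cost marks copies of this class as prohibitively
+  // expensive, which keeps the allocator from trying to materialize SCC
+  // in another register.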
+} + +def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; +def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; + +// Register class for all scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) +>; + +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; + +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, + (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) +>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; + +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; + +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; + +// Register class for all vector registers (VGPRs + Interploation Registers) +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; + +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { + let Size = 96; +} + +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; + +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; + +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; + +def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { + let Size = 32; +} + +class RegImmOperand : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; +} + +class RegInlineOperand : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; +} + +//===----------------------------------------------------------------------===// +// SSrc_* Operands with an SGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def SSrc_32 : RegImmOperand { + let ParserMatchClass = RegImmMatcher<"SSrc32">; +} + +def SSrc_64 : RegImmOperand { + let ParserMatchClass = RegImmMatcher<"SSrc64">; +} + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or a inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_32 : RegInlineOperand { + let ParserMatchClass = RegImmMatcher<"SCSrc32">; +} + +//===----------------------------------------------------------------------===// +// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate +//===----------------------------------------------------------------------===// + +def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; + +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; + +def VSrc_32 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc32">; +} + +def VSrc_64 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_IMM32"; + let ParserMatchClass = RegImmMatcher<"VSrc64">; +} + +//===----------------------------------------------------------------------===// +// VCSrc_* Operands with an SGPR, VGPR or an inline constant +//===----------------------------------------------------------------------===// + +def VCSrc_32 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc32">; +} + +def VCSrc_64 : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = 
"OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"VCSrc64">; +} diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td new file mode 100644 index 00000000000..9b1f676020b --- /dev/null +++ b/lib/Target/AMDGPU/SISchedule.td @@ -0,0 +1,91 @@ +//===-- SISchedule.td - SI Scheduling definitons -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MachineModel definitions for Southern Islands (SI) +// +//===----------------------------------------------------------------------===// + +def WriteBranch : SchedWrite; +def WriteExport : SchedWrite; +def WriteLDS : SchedWrite; +def WriteSALU : SchedWrite; +def WriteSMEM : SchedWrite; +def WriteVMEM : SchedWrite; + +// Vector ALU instructions +def Write32Bit : SchedWrite; +def WriteQuarterRate32 : SchedWrite; + +def WriteFloatFMA : SchedWrite; + +def WriteDouble : SchedWrite; +def WriteDoubleAdd : SchedWrite; + +def SIFullSpeedModel : SchedMachineModel; +def SIQuarterSpeedModel : SchedMachineModel; + +// BufferSize = 0 means the processors are in-order. +let BufferSize = 0 in { + +// XXX: Are the resource counts correct? +def HWBranch : ProcResource<1>; +def HWExport : ProcResource<7>; // Taken from S_WAITCNT +def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT +def HWSALU : ProcResource<1>; +def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT +def HWVALU : ProcResource<1>; + +} + +class HWWriteRes resources, + int latency> : WriteRes { + let Latency = latency; +} + +class HWVALUWriteRes : + HWWriteRes; + + +// The latency numbers are taken from AMD Accelerated Parallel Processing +// guide. They may not be acurate. + +// The latency values are 1 / (operations / cycle) / 4. +multiclass SICommonWriteRes { + + def : HWWriteRes; // XXX: Guessed ??? + def : HWWriteRes; // XXX: Guessed ??? + def : HWWriteRes; // 2 - 64 + def : HWWriteRes; + def : HWWriteRes; // XXX: Guessed ??? + def : HWWriteRes; // 300 - 600 + + def : HWVALUWriteRes; + def : HWVALUWriteRes; +} + + +let SchedModel = SIFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +} // End SchedModel = SIFullSpeedModel + +let SchedModel = SIQuarterSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +} // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp new file mode 100644 index 00000000000..51e72cdb5f9 --- /dev/null +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -0,0 +1,272 @@ +//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// The pass tries to use the 32-bit encoding for instructions when possible. 
+//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUMCInstLower.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-shrink-instructions" + +STATISTIC(NumInstructionsShrunk, + "Number of 64-bit instruction reduced to 32-bit."); +STATISTIC(NumLiteralConstantsFolded, + "Number of literal constants folded into 32-bit instructions."); + +namespace llvm { + void initializeSIShrinkInstructionsPass(PassRegistry&); +} + +using namespace llvm; + +namespace { + +class SIShrinkInstructions : public MachineFunctionPass { +public: + static char ID; + +public: + SIShrinkInstructions() : MachineFunctionPass(ID) { + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Shrink Instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) +INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, + "SI Lower il Copies", false, false) + +char SIShrinkInstructions::ID = 0; + +FunctionPass *llvm::createSIShrinkInstructionsPass() { + return new SIShrinkInstructions(); +} + +static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + if (!MO->isReg()) + return false; + + if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) + return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); + + return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); +} + +static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + + const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + // Can't shrink instruction with three operands. + // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add + // a special case for it. It can only be shrunk if the third operand + // is vcc. We should handle this the same way we handle vopc, by addding + // a register allocation hint pre-regalloc and then do the shrining + // post-regalloc. + if (Src2) + return false; + + const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src1Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + + if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) + return false; + + // We don't need to check src0, all input types are legal, so just make sure + // src0 isn't using any modifiers. + if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) + return false; + + // Check output modifiers + if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return false; + + if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) + return false; + + return true; +} + +/// \brief This function checks \p MI for operands defined by a move immediate +/// instruction and then folds the literal constant into the instruction if it +/// can. 
This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction +/// and will only fold literal constants if we are still in SSA. +static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, + MachineRegisterInfo &MRI, bool TryToCommute = true) { + + if (!MRI.isSSA()) + return; + + assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || + TII->isVOPC(MI.getOpcode())); + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + + // Only one literal constant is allowed per instruction, so if src0 is a + // literal constant then we can't do any folding. + if (Src0.isImm() && + TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) + return; + + // Literal constants and SGPRs can only be used in Src0, so if Src0 is an + // SGPR, we cannot commute the instruction, so we can't fold any literal + // constants. + if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) + return; + + // Try to fold Src0 + if (Src0.isReg()) { + unsigned Reg = Src0.getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } + if (ConstantFolded) { + if (MRI.use_empty(Reg)) + Def->eraseFromParent(); + ++NumLiteralConstantsFolded; + return; + } + } + } + + // We have failed to fold src0, so commute the instruction and try again. + if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + foldImmediates(MI, TII, MRI, false); + +} + +bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = + static_cast(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + std::vector I1Defs; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineOperand &Src = MI.getOperand(1); + + if (Src.isImm()) { + if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + } + + continue; + } + + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + if (!canShrink(MI, TII, TRI, MRI)) { + // Try commuting the instruction and see if that enables us to shrink + // it. + if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + !canShrink(MI, TII, TRI, MRI)) + continue; + } + + // getVOPe32 could be -1 here if we started with an instruction that had + // a 32-bit encoding and then commuted it to an instruction that did not. + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) + continue; + + int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); + + if (TII->isVOPC(Op32)) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (TargetRegisterInfo::isVirtualRegister(DstReg)) { + // VOPC instructions can only write to the VCC register. 
We can't + // force them to use VCC here, because the register allocator has + // trouble with sequences like this, which cause the allocator to run + // out of registers if vreg0 and vreg1 belong to the VCCReg register + // class: + // vreg0 = VOPC; + // vreg1 = VOPC; + // S_AND_B64 vreg0, vreg1 + // + // So, instead of forcing the instruction to write to VCC, we provide + // a hint to the register allocator to use VCC and then we we will run + // this pass again after RA and shrink it if it outputs to VCC. + MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); + continue; + } + if (DstReg != AMDGPU::VCC) + continue; + } + + // We can shrink this instruction + DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); + + MachineInstrBuilder Inst32 = + BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); + + // dst + Inst32.addOperand(MI.getOperand(0)); + + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = + TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + Inst32.addOperand(*Src1); + + ++NumInstructionsShrunk; + MI.eraseFromParent(); + + foldImmediates(*Inst32, TII, MRI); + DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); + + + } + } + return false; +} diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp new file mode 100644 index 00000000000..591ce857cc7 --- /dev/null +++ b/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -0,0 +1,161 @@ +//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass removes performs the following type substitution on all +/// non-compute shaders: +/// +/// v16i8 => i128 +/// - v16i8 is used for constant memory resource descriptors. This type is +/// legal for some compute APIs, and we don't want to declare it as legal +/// in the backend, because we want the legalizer to expand all v16i8 +/// operations. +/// v1* => * +/// - Having v1* types complicates the legalizer and we can easily replace +/// - them with the element type. 
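+///
+/// As an illustration of the first rewrite, a load of a v16i8 resource
+/// descriptor is replaced by a v4i32 load through a bitcast of the pointer,
+/// followed by a bitcast of the loaded value back to the type the original
+/// users expect; intrinsic calls taking v16i8 arguments are redirected to a
+/// ".v4i32" variant of the callee.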
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; + +namespace { + +class SITypeRewriter : public FunctionPass, + public InstVisitor { + + static char ID; + Module *Mod; + Type *v16i8; + Type *v4i32; + +public: + SITypeRewriter() : FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { + return "SI Type Rewriter"; + } + void visitLoadInst(LoadInst &I); + void visitCallInst(CallInst &I); + void visitBitCast(BitCastInst &I); +}; + +} // End anonymous namespace + +char SITypeRewriter::ID = 0; + +bool SITypeRewriter::doInitialization(Module &M) { + Mod = &M; + v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); + v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); + return false; +} + +bool SITypeRewriter::runOnFunction(Function &F) { + Attribute A = F.getFnAttribute("ShaderType"); + + unsigned ShaderType = ShaderType::COMPUTE; + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + Str.getAsInteger(0, ShaderType); + } + if (ShaderType == ShaderType::COMPUTE) + return false; + + visit(F); + visit(F); + + return false; +} + +void SITypeRewriter::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + Type *PtrTy = Ptr->getType(); + Type *ElemTy = PtrTy->getPointerElementType(); + IRBuilder<> Builder(&I); + if (ElemTy == v16i8) { + Value *BitCast = Builder.CreateBitCast(Ptr, + PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); + LoadInst *Load = Builder.CreateLoad(BitCast); + SmallVector, 8> MD; + I.getAllMetadataOtherThanDebugLoc(MD); + for (unsigned i = 0, e = MD.size(); i != e; ++i) { + Load->setMetadata(MD[i].first, MD[i].second); + } + Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); + I.replaceAllUsesWith(BitCastLoad); + I.eraseFromParent(); + } +} + +void SITypeRewriter::visitCallInst(CallInst &I) { + IRBuilder<> Builder(&I); + + SmallVector Args; + SmallVector Types; + bool NeedToReplace = false; + Function *F = I.getCalledFunction(); + std::string Name = F->getName(); + for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { + Value *Arg = I.getArgOperand(i); + if (Arg->getType() == v16i8) { + Args.push_back(Builder.CreateBitCast(Arg, v4i32)); + Types.push_back(v4i32); + NeedToReplace = true; + Name = Name + ".v4i32"; + } else if (Arg->getType()->isVectorTy() && + Arg->getType()->getVectorNumElements() == 1 && + Arg->getType()->getVectorElementType() == + Type::getInt32Ty(I.getContext())){ + Type *ElementTy = Arg->getType()->getVectorElementType(); + std::string TypeName = "i32"; + InsertElementInst *Def = cast(Arg); + Args.push_back(Def->getOperand(1)); + Types.push_back(ElementTy); + std::string VecTypeName = "v1" + TypeName; + Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); + NeedToReplace = true; + } else { + Args.push_back(Arg); + Types.push_back(Arg->getType()); + } + } + + if (!NeedToReplace) { + return; + } + Function *NewF = Mod->getFunction(Name); + if (!NewF) { + NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); + NewF->setAttributes(F->getAttributes()); + } + I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); + I.eraseFromParent(); +} + +void SITypeRewriter::visitBitCast(BitCastInst &I) { + IRBuilder<> Builder(&I); + if (I.getDestTy() != v4i32) { + return; + 
} + + if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { + if (Op->getSrcTy() == v4i32) { + I.replaceAllUsesWith(Op->getOperand(0)); + I.eraseFromParent(); + } + } +} + +FunctionPass *llvm::createSITypeRewriter() { + return new SITypeRewriter(); +} diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp new file mode 100644 index 00000000000..2112135aa5d --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -0,0 +1,30 @@ +//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +/// \brief The target which suports all AMD GPUs. This will eventually +/// be deprecated and there will be a R600 target and a GCN target. +Target llvm::TheAMDGPUTarget; +/// \brief The target for GCN GPUs +Target llvm::TheGCNTarget; + +/// \brief Extern function to initialize the targets for the AMDGPU backend +extern "C" void LLVMInitializeAMDGPUTargetInfo() { + RegisterTarget + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); + RegisterTarget GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); +} diff --git a/lib/Target/AMDGPU/TargetInfo/CMakeLists.txt b/lib/Target/AMDGPU/TargetInfo/CMakeLists.txt new file mode 100644 index 00000000000..961dc550900 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAMDGPUInfo + AMDGPUTargetInfo.cpp + ) diff --git a/lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt b/lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt new file mode 100644 index 00000000000..291317fa072 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUInfo +parent = AMDGPU +required_libraries = Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/TargetInfo/Makefile b/lib/Target/AMDGPU/TargetInfo/Makefile new file mode 100644 index 00000000000..b8ac4e78230 --- /dev/null +++ b/lib/Target/AMDGPU/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td new file mode 100644 index 00000000000..d8738f99263 --- /dev/null +++ b/lib/Target/AMDGPU/VIInstrFormats.td @@ -0,0 +1,166 @@ +//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// VI Instruction format definitions. +// +//===----------------------------------------------------------------------===// + +class DSe_vi op> : Enc64 { + bits<8> vdst; + bits<1> gds; + bits<8> addr; + bits<8> data0; + bits<8> data1; + bits<8> offset0; + bits<8> offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{16} = gds; + let Inst{24-17} = op; + let Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; +} + +class MUBUFe_vi op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{16} = lds; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class MTBUFe_vi op> : Enc64 { + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{18-15} = op; + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; +} + +class SMEMe_vi op, bit imm> : Enc64 { + bits<7> sbase; + bits<7> sdata; + bits<1> glc; + bits<20> offset; + + let Inst{5-0} = sbase{6-1}; + let Inst{12-6} = sdata; + let Inst{16} = glc; + let Inst{17} = imm; + let Inst{25-18} = op; + let Inst{31-26} = 0x30; //encoding + let Inst{51-32} = offset; +} + +class VOP3e_vi op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = vdst; + let Inst{8} = src0_modifiers{1}; + let Inst{9} = src1_modifiers{1}; + let Inst{10} = src2_modifiers{1}; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class VOP3be_vi op> : Enc64 { + bits<8> vdst; + bits<2> src0_modifiers; + bits<9> src0; + bits<2> src1_modifiers; + bits<9> src1; + bits<2> src2_modifiers; + bits<9> src2; + bits<7> sdst; + bits<2> omod; + bits<1> clamp; + + let Inst{7-0} = vdst; + let Inst{14-8} = sdst; + let Inst{15} = clamp; + let Inst{25-16} = op; + let Inst{31-26} 
= 0x34; //encoding + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{61} = src0_modifiers{0}; + let Inst{62} = src1_modifiers{0}; + let Inst{63} = src2_modifiers{0}; +} + +class EXPe_vi : EXPe { + let Inst{31-26} = 0x31; //encoding +} + +class VINTRPe_vi op> : VINTRPe { + let Inst{31-26} = 0x35; // encoding +} diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td new file mode 100644 index 00000000000..5bf86e649ce --- /dev/null +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -0,0 +1,106 @@ +//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Instruction definitions for VI and newer. +//===----------------------------------------------------------------------===// + +let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +defm V_CVT_F16_U16 : VOP1Inst , "v_cvt_f16_u16", VOP_F16_I16>; +defm V_CVT_F16_I16 : VOP1Inst , "v_cvt_f16_i16", VOP_F16_I16>; +defm V_CVT_U16_F16 : VOP1Inst , "v_cvt_u16_f16", VOP_I16_F16>; +defm V_CVT_I16_F16 : VOP1Inst , "v_cvt_i16_f16", VOP_I16_F16>; +defm V_RCP_F16 : VOP1Inst , "v_rcp_f16", VOP_F16_F16>; +defm V_SQRT_F16 : VOP1Inst , "v_sqrt_f16", VOP_F16_F16>; +defm V_RSQ_F16 : VOP1Inst , "v_rsq_f16", VOP_F16_F16>; +defm V_LOG_F16 : VOP1Inst , "v_log_f16", VOP_F16_F16>; +defm V_EXP_F16 : VOP1Inst , "v_exp_f16", VOP_F16_F16>; +defm V_FREXP_MANT_F16 : VOP1Inst , "v_frexp_mant_f16", + VOP_F16_F16 +>; +defm V_FREXP_EXP_I16_F16 : VOP1Inst , "v_frexp_exp_i16_f16", + VOP_I16_F16 +>; +defm V_FLOOR_F16 : VOP1Inst , "v_floor_f16", VOP_F16_F16>; +defm V_CEIL_F16 : VOP1Inst , "v_ceil_f16", VOP_F16_F16>; +defm V_TRUNC_F16 : VOP1Inst , "v_trunc_f16", VOP_F16_F16>; +defm V_RNDNE_F16 : VOP1Inst , "v_rndne_f16", VOP_F16_F16>; +defm V_FRACT_F16 : VOP1Inst , "v_fract_f16", VOP_F16_F16>; +defm V_SIN_F16 : VOP1Inst , "v_sin_f16", VOP_F16_F16>; +defm V_COS_F16 : VOP1Inst , "v_cos_f16", VOP_F16_F16>; + +//===----------------------------------------------------------------------===// +// VOP2 Instructions +//===----------------------------------------------------------------------===// + +let isCommutable = 1 in { + +defm V_ADD_F16 : VOP2Inst , "v_add_f16", VOP_F16_F16_F16>; +defm V_SUB_F16 : VOP2Inst , "v_sub_f16", VOP_F16_F16_F16>; +defm V_SUBREV_F16 : VOP2Inst , "v_subrev_f16", VOP_F16_F16_F16, + null_frag, "v_sub_f16" +>; +defm V_MUL_F16 : VOP2Inst , "v_mul_f16", VOP_F16_F16_F16>; +defm V_MAC_F16 : VOP2Inst , "v_mac_f16", VOP_F16_F16_F16>; +} // End isCommutable = 1 +defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16">; +let isCommutable = 1 in { +defm V_MADAK_F16 : VOP2MADK , "v_madak_f16">; +defm V_ADD_U16 : VOP2Inst , "v_add_u16", VOP_I16_I16_I16>; +defm V_SUB_U16 : VOP2Inst , "v_sub_u16" , VOP_I16_I16_I16>; +defm V_SUBREV_U16 : VOP2Inst , "v_subrev_u16", VOP_I16_I16_I16>; +defm V_MUL_LO_U16 : VOP2Inst , "v_mul_lo_u16", VOP_I16_I16_I16>; +} // End isCommutable = 1 +defm V_LSHLREV_B16 : VOP2Inst , "v_lshlrev_b16", VOP_I16_I16_I16>; +defm V_LSHRREV_B16 : VOP2Inst , "v_lshrrev_b16", VOP_I16_I16_I16>; +defm V_ASHRREV_B16 : 
VOP2Inst , "v_ashrrev_b16", VOP_I16_I16_I16>; +let isCommutable = 1 in { +defm V_MAX_F16 : VOP2Inst , "v_max_f16", VOP_F16_F16_F16>; +defm V_MIN_F16 : VOP2Inst , "v_min_f16", VOP_F16_F16_F16>; +defm V_MAX_U16 : VOP2Inst , "v_max_u16", VOP_I16_I16_I16>; +defm V_MAX_I16 : VOP2Inst , "v_max_i16", VOP_I16_I16_I16>; +defm V_MIN_U16 : VOP2Inst , "v_min_u16", VOP_I16_I16_I16>; +defm V_MIN_I16 : VOP2Inst , "v_min_i16", VOP_I16_I16_I16>; +} // End isCommutable = 1 +defm V_LDEXP_F16 : VOP2Inst , "v_ldexp_f16", VOP_F16_F16_I16>; + +// Aliases to simplify matching of floating-pint instructions that are VOP2 on +// SI and VOP3 on VI. + +class SI2_VI3Alias : InstAlias < + name#" $dst, $src0, $src1", + (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0) +>, PredicateControl { + let UseInstAsmMatchConverter = 0; +} + +def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; +def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; + +} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI + +//===----------------------------------------------------------------------===// +// SMEM Patterns +//===----------------------------------------------------------------------===// + +let Predicates = [isVI] in { + +// 1. Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +} // End Predicates = [isVI] diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 3af3426b94c..ab823248928 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -19,6 +19,7 @@ ; will typically require only insertion of a line. [common] subdirectories = + AMDGPU ARM AArch64 BPF @@ -28,7 +29,6 @@ subdirectories = NVPTX Mips PowerPC - R600 Sparc SystemZ X86 diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h deleted file mode 100644 index 0a05d25189b..00000000000 --- a/lib/Target/R600/AMDGPU.h +++ /dev/null @@ -1,148 +0,0 @@ -//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H -#define LLVM_LIB_TARGET_R600_AMDGPU_H - -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetMachine.h" - -namespace llvm { - -class AMDGPUInstrPrinter; -class AMDGPUSubtarget; -class AMDGPUTargetMachine; -class FunctionPass; -class MCAsmInfo; -class raw_ostream; -class Target; -class TargetMachine; - -// R600 Passes -FunctionPass *createR600VectorRegMerger(TargetMachine &tm); -FunctionPass *createR600TextureIntrinsicsReplacer(); -FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); -FunctionPass *createR600EmitClauseMarkers(); -FunctionPass *createR600ClauseMergePass(TargetMachine &tm); -FunctionPass *createR600Packetizer(TargetMachine &tm); -FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); -FunctionPass *createAMDGPUCFGStructurizerPass(); - -// SI Passes -FunctionPass *createSITypeRewriter(); -FunctionPass *createSIAnnotateControlFlowPass(); -FunctionPass *createSIFoldOperandsPass(); -FunctionPass *createSILowerI1CopiesPass(); -FunctionPass *createSIShrinkInstructionsPass(); -FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); -FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); -FunctionPass *createSIFixControlFlowLiveIntervalsPass(); -FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); -FunctionPass *createSIFixSGPRLiveRangesPass(); -FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); - -void initializeSIFoldOperandsPass(PassRegistry &); -extern char &SIFoldOperandsID; - -void initializeSILowerI1CopiesPass(PassRegistry &); -extern char &SILowerI1CopiesID; - -void initializeSILoadStoreOptimizerPass(PassRegistry &); -extern char &SILoadStoreOptimizerID; - -// Passes common to R600 and SI -FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); -Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &tm); -ModulePass *createAMDGPUAlwaysInlinePass(); - -void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); -extern char &SIFixControlFlowLiveIntervalsID; - -void initializeSIFixSGPRLiveRangesPass(PassRegistry&); -extern char &SIFixSGPRLiveRangesID; - - -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; - -namespace AMDGPU { -enum TargetIndex { - TI_CONSTDATA_START, - TI_SCRATCH_RSRC_DWORD0, - TI_SCRATCH_RSRC_DWORD1, - TI_SCRATCH_RSRC_DWORD2, - TI_SCRATCH_RSRC_DWORD3 -}; -} - -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - -} // End namespace llvm - -namespace ShaderType { - enum Type { - PIXEL = 0, - VERTEX = 1, - GEOMETRY = 2, - COMPUTE = 3 - }; -} - -/// OpenCL uses address spaces to differentiate between -/// various memory regions on the hardware. On the CPU -/// all of the address spaces point to the same memory, -/// however on the GPU, each address space points to -/// a separate piece of memory that is unique from other -/// memory locations. -namespace AMDGPUAS { -enum AddressSpaces : unsigned { - PRIVATE_ADDRESS = 0, ///< Address space for private memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory - LOCAL_ADDRESS = 3, ///< Address space for local memory. - FLAT_ADDRESS = 4, ///< Address space for flat memory. - REGION_ADDRESS = 5, ///< Address space for region memory. 
- PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) - - // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this - // order to be able to dynamically index a constant buffer, for example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, - ADDRESS_NONE = 24, ///< Address space for unknown memory. - LAST_ADDRESS = ADDRESS_NONE, - - // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u -}; - -} // namespace AMDGPUAS - -#endif diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td deleted file mode 100644 index 2e7e39a54d3..00000000000 --- a/lib/Target/R600/AMDGPU.td +++ /dev/null @@ -1,266 +0,0 @@ -//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// Subtarget Features -//===----------------------------------------------------------------------===// - -// Debugging Features - -def FeatureDumpCode : SubtargetFeature <"DumpCode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", - "EnableIRStructurizer", - "false", - "Disable IR Structurizer">; - -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass">; - -// Target features - -def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", - "EnableIfCvt", - "false", - "Disable the if conversion pass">; - -def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations">; - -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64Denormals", - "true", - "Enable double precision denormal handling", - [FeatureFP64]>; - -def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", - "FastFMAF32", - "true", - "Assuming f32 fma is at least as fast as mul + add", - []>; - -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. 
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling">; - -def Feature64BitPtr : SubtargetFeature<"64BitPtr", - "Is64bit", - "true", - "Specify if 64-bit addressing should be used">; - -def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding">; - -def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", - "true", - "Specify use of dedicated vertex cache">; - -def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA">; - -def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", - "true", - "GPU has CF_ALU bug">; - -// XXX - This should probably be removed once enabled by default -def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", - "EnableLoadStoreOpt", - "true", - "Enable SI load/store optimizer pass">; - -def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", - "FlatAddressSpace", - "true", - "Support flat address space">; - -def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", - "EnableVGPRSpilling", - "true", - "Enable spilling of VGPRs to scratch memory">; - -def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", - "SGPRInitBug", - "true", - "VI SGPR initilization bug requiring a fixed SGPR allocation size">; - -class SubtargetFeatureFetchLimit : - SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value>; - -def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; -def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; - -class SubtargetFeatureWavefrontSize : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast(Value), - "The number of threads per wavefront">; - -def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; -def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; -def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; - -class SubtargetFeatureLDSBankCount : SubtargetFeature < - "ldsbankcount"#Value, - "LDSBankCount", - !cast(Value), - "The number of LDS banks per compute unit.">; - -def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; -def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; - -class SubtargetFeatureLocalMemorySize : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast(Value), - "The size of local memory in bytes">; - -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU">; - -def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", - "GCN1Encoding", - "true", - "Encoding format for SI and CI">; - -def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", - "GCN3Encoding", - "true", - "Encoding format for VI">; - -def FeatureCIInsts : SubtargetFeature<"ci-insts", - "CIInsts", - "true", - "Additional intstructions for CI+">; - -// Dummy feature used to disable assembler instructions. 
-def FeatureDisable : SubtargetFeature<"", - "FeatureDisable","true", - "Dummy feature to disable assembler" - " instructions">; - -class SubtargetFeatureGeneration Implies> : - SubtargetFeature ; - -def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; -def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; -def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; - -def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; - -def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0]>; - -def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; - -def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] ->; - -def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, - FeatureLDSBankCount32]>; - -def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts]>; - -def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>; - -//===----------------------------------------------------------------------===// - -def AMDGPUInstrInfo : InstrInfo { - let guessInstructionProperties = 1; - let noNamedPositionallyEncodedOperands = 1; -} - -def AMDGPUAsmParser : AsmParser { - // Some of the R600 registers have the same name, so this crashes. - // For example T0_XYZW and T0_XY both have the asm name T0. 
- let ShouldEmitMatchRegisterName = 0; -} - -def AMDGPU : Target { - // Pull in Instruction Info: - let InstructionSet = AMDGPUInstrInfo; - let AssemblyParsers = [AMDGPUAsmParser]; -} - -// Dummy Instruction itineraries for pseudo instructions -def ALU_NULL : FuncUnit; -def NullALU : InstrItinClass; - -//===----------------------------------------------------------------------===// -// Predicate helper class -//===----------------------------------------------------------------------===// - -def TruePredicate : Predicate<"true">; -def isSICI : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"FeatureGCN1Encoding">; - -class PredicateControl { - Predicate SubtargetPredicate; - Predicate SIAssemblerPredicate = isSICI; - list AssemblerPredicates = []; - Predicate AssemblerPredicate = TruePredicate; - list OtherPredicates = []; - list Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], - AssemblerPredicates, - OtherPredicates); -} - -// Include AMDGPU TD files -include "R600Schedule.td" -include "SISchedule.td" -include "Processors.td" -include "AMDGPUInstrInfo.td" -include "AMDGPUIntrinsics.td" -include "AMDGPURegisterInfo.td" -include "AMDGPUInstructions.td" -include "AMDGPUCallingConv.td" diff --git a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp deleted file mode 100644 index 0b426bc63dd..00000000000 --- a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp +++ /dev/null @@ -1,67 +0,0 @@ -//===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass marks all internal functions as always_inline and creates -/// duplicates of all other functions a marks the duplicates as always_inline. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Cloning.h" - -using namespace llvm; - -namespace { - -class AMDGPUAlwaysInline : public ModulePass { - - static char ID; - -public: - AMDGPUAlwaysInline() : ModulePass(ID) { } - bool runOnModule(Module &M) override; - const char *getPassName() const override { return "AMDGPU Always Inline Pass"; } -}; - -} // End anonymous namespace - -char AMDGPUAlwaysInline::ID = 0; - -bool AMDGPUAlwaysInline::runOnModule(Module &M) { - - std::vector FuncsToClone; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(Attribute::NoInline)) - FuncsToClone.push_back(&F); - } - - for (Function *F : FuncsToClone) { - ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap, false); - NewFunc->setLinkage(GlobalValue::InternalLinkage); - F->getParent()->getFunctionList().push_back(NewFunc); - F->replaceAllUsesWith(NewFunc); - } - - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { - F.addFnAttr(Attribute::AlwaysInline); - } - } - return false; -} - -ModulePass *llvm::createAMDGPUAlwaysInlinePass() { - return new AMDGPUAlwaysInline(); -} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp deleted file mode 100644 index 56b50a9c159..00000000000 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ /dev/null @@ -1,600 +0,0 @@ -//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// The AMDGPUAsmPrinter is used to print both assembly string and also binary -/// code. When passed an MCAsmStreamer it prints assembly and when passed -/// an MCObjectStreamer it outputs binary code. -// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPUAsmPrinter.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "AMDGPU.h" -#include "AMDKernelCodeT.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "SIDefines.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -using namespace llvm; - -// TODO: This should get the default rounding mode from the kernel. We just set -// the default here, but this could change if the OpenCL rounding mode pragmas -// are used. -// -// The denormal mode here should match what is reported by the OpenCL runtime -// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but -// can also be override to flush with the -cl-denorms-are-zero compiler flag. 
-// -// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double -// precision, and leaves single precision to flush all and does not report -// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports -// CL_FP_DENORM for both. -// -// FIXME: It seems some instructions do not support single precision denormals -// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, -// and sin_f32, cos_f32 on most parts). - -// We want to use these instructions, and using fp32 denormals also causes -// instructions to run at the double precision rate for the device so it's -// probably best to just report no single precision denormals. -static uint32_t getFPMode(const MachineFunction &F) { - const AMDGPUSubtarget& ST = F.getSubtarget(); - // TODO: Is there any real use for the flush in only / flush out only modes? - - uint32_t FP32Denormals = - ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - uint32_t FP64Denormals = - ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | - FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | - FP_DENORM_MODE_SP(FP32Denormals) | - FP_DENORM_MODE_DP(FP64Denormals); -} - -static AsmPrinter * -createAMDGPUAsmPrinterPass(TargetMachine &tm, - std::unique_ptr &&Streamer) { - return new AMDGPUAsmPrinter(tm, std::move(Streamer)); -} - -extern "C" void LLVMInitializeR600AsmPrinter() { - TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); - TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass); -} - -AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : AsmPrinter(TM, std::move(Streamer)) {} - -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - - // This label is used to mark the end of the .text section. - const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); -} - -bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { - - // The starting address of all shader programs must be 256 bytes aligned. 
- MF.setAlignment(8); - - SetupMachineFunction(MF); - - MCContext &Context = getObjFileLowering().getContext(); - MCSectionELF *ConfigSection = - Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(ConfigSection); - - const AMDGPUSubtarget &STM = MF.getSubtarget(); - SIProgramInfo KernelInfo; - if (STM.isAmdHsaOS()) { - getSIProgramInfo(KernelInfo, MF); - EmitAmdKernelCodeT(MF, KernelInfo); - OutStreamer->EmitCodeAlignment(2 << (MF.getAlignment() - 1)); - } else if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - getSIProgramInfo(KernelInfo, MF); - EmitProgramInfoSI(MF, KernelInfo); - } else { - EmitProgramInfoR600(MF); - } - - DisasmLines.clear(); - HexLines.clear(); - DisasmLineMaxLen = 0; - - EmitFunctionBody(); - - if (isVerbose()) { - MCSectionELF *CommentSection = - Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(CommentSection); - - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); - OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), - false); - OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), - false); - OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), - false); - OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), - false); - OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), - false); - } else { - R600MachineFunctionInfo *MFI = MF.getInfo(); - OutStreamer->emitRawComment( - Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); - } - } - - if (STM.dumpCode()) { - - OutStreamer->SwitchSection( - Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); - - for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; - - OutStreamer->EmitBytes(StringRef(DisasmLines[i])); - OutStreamer->EmitBytes(StringRef(Comment)); - } - } - - return false; -} - -void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { - unsigned MaxGPR = 0; - bool killPixel = false; - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const R600RegisterInfo *RI = - static_cast(STM.getRegisterInfo()); - const R600MachineFunctionInfo *MFI = MF.getInfo(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::KILLGT) - killPixel = true; - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - if (!MO.isReg()) - continue; - unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; - - // Register with value > 127 aren't GPR - if (HWReg > 127) - continue; - MaxGPR = std::max(MaxGPR, HWReg); - } - } - } - - unsigned RsrcReg; - if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { - // Evergreen / Northern Islands - switch (MFI->getShaderType()) { - default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; - } - } else { - // R600 / R700 - switch (MFI->getShaderType()) { - default: // Fall through - case 
ShaderType::GEOMETRY: // Fall through - case ShaderType::COMPUTE: // Fall through - case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; - } - } - - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | - S_STACK_SIZE(MFI->StackSize), 4); - OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); - OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - - if (MFI->getShaderType() == ShaderType::COMPUTE) { - OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); - } -} - -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = - static_cast(STM.getRegisterInfo()); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - // TODO: CodeSize should account for multiple functions. - CodeSize += MI.getDesc().Size; - - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; - - if (!MO.isReg()) { - continue; - } - unsigned reg = MO.getReg(); - if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || - reg == AMDGPU::VCC_HI) { - VCCUsed = true; - continue; - } else if (reg == AMDGPU::FLAT_SCR || - reg == AMDGPU::FLAT_SCR_LO || - reg == AMDGPU::FLAT_SCR_HI) { - FlatUsed = true; - continue; - } - - switch (reg) { - default: break; - case AMDGPU::SCC: - case AMDGPU::EXEC: - case AMDGPU::M0: - continue; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; - } - } - } - } - - if (VCCUsed) - MaxSGPR += 2; - - if (FlatUsed) - MaxSGPR += 2; - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. 
- ProgInfo.NumVGPR = MaxVGPR + 1; - ProgInfo.NumSGPR = MaxSGPR + 1; - - if (STM.hasSGPRInitBug()) { - if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) - llvm_unreachable("Too many SGPRs used with the SGPR init bug"); - - ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; - } - - ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; - ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; - // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode - // register. - ProgInfo.FloatMode = getFPMode(MF); - - // XXX: Not quite sure what this does, but sc seems to unset this. - ProgInfo.IEEEMode = 0; - - // Do not clamp NAN to 0. - ProgInfo.DX10Clamp = 0; - - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); - - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; - - unsigned LDSAlignShift; - if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks. - LDSAlignShift = 8; - } else { - // LDS is allocated in 128 dword blocks. - LDSAlignShift = 9; - } - - unsigned LDSSpillSize = MFI->LDSWaveSpillSize * - MFI->getMaximumWorkGroupSize(MF); - - ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; - ProgInfo.LDSBlocks = - RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; - - // Scratch is allocated in 256 dword blocks. - unsigned ScratchAlignShift = 10; - // We need to program the hardware with the amount of scratch memory that - // is used by the entire wave. ProgInfo.ScratchSize is the amount of - // scratch memory used per thread. - ProgInfo.ScratchBlocks = - RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1 << ScratchAlignShift) >> ScratchAlignShift; - - ProgInfo.ComputePGMRSrc1 = - S_00B848_VGPRS(ProgInfo.VGPRBlocks) | - S_00B848_SGPRS(ProgInfo.SGPRBlocks) | - S_00B848_PRIORITY(ProgInfo.Priority) | - S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | - S_00B848_PRIV(ProgInfo.Priv) | - S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); - - ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); -} - -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { - default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; - } -} - -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); - - if (MFI->getShaderType() == ShaderType::COMPUTE) { - OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); - - OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); - - OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); - - // TODO: 
Should probably note flat usage somewhere. SC emits a "FlatPtr32 = - // 0" comment but I don't see a corresponding field in the register spec. - } else { - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | - S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MFI)) { - OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); - } - } - - if (MFI->getShaderType() == ShaderType::PIXEL) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); - } -} - -void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - const AMDGPUSubtarget &STM = MF.getSubtarget(); - amd_kernel_code_t header; - - memset(&header, 0, sizeof(header)); - - header.amd_code_version_major = AMD_CODE_VERSION_MAJOR; - header.amd_code_version_minor = AMD_CODE_VERSION_MINOR; - - header.struct_byte_size = sizeof(amd_kernel_code_t); - - header.target_chip = STM.getAmdKernelCodeChipID(); - - header.kernel_code_entry_byte_offset = (1ULL << MF.getAlignment()); - - header.compute_pgm_resource_registers = - KernelInfo.ComputePGMRSrc1 | - (KernelInfo.ComputePGMRSrc2 << 32); - - // Code Properties: - header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; - - if (KernelInfo.FlatUsed) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - - if (KernelInfo.ScratchBlocks) - header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; - - header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - - // MFI->ABIArgOffset is the number of bytes for the kernel arguments - // plus 36. 36 is the number of bytes reserved at the begining of the - // input buffer to store work-group size information. - // FIXME: We should be adding the size of the implicit arguments - // to this value. - header.kernarg_segment_byte_size = MFI->ABIArgOffset; - - header.wavefront_sgpr_count = KernelInfo.NumSGPR; - header.workitem_vgpr_count = KernelInfo.NumVGPR; - - // FIXME: What values do I put for these alignments - header.kernarg_segment_alignment = 0; - header.group_segment_alignment = 0; - header.private_segment_alignment = 0; - - header.code_type = 1; // HSA_EXT_CODE_KERNEL - - header.wavefront_size = STM.getWavefrontSize(); - - MCSectionELF *VersionSection = - OutContext.getELFSection(".hsa.version", ELF::SHT_PROGBITS, 0); - OutStreamer->SwitchSection(VersionSection); - OutStreamer->EmitBytes(Twine("HSA Code Unit:" + - Twine(header.hsail_version_major) + "." + - Twine(header.hsail_version_minor) + ":" + - "AMD:" + - Twine(header.amd_code_version_major) + "." 
+ - Twine(header.amd_code_version_minor) + ":" + - "GFX8.1:0").str()); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - - if (isVerbose()) { - OutStreamer->emitRawComment("amd_code_version_major = " + - Twine(header.amd_code_version_major), false); - OutStreamer->emitRawComment("amd_code_version_minor = " + - Twine(header.amd_code_version_minor), false); - OutStreamer->emitRawComment("struct_byte_size = " + - Twine(header.struct_byte_size), false); - OutStreamer->emitRawComment("target_chip = " + - Twine(header.target_chip), false); - OutStreamer->emitRawComment(" compute_pgm_rsrc1: " + - Twine::utohexstr(KernelInfo.ComputePGMRSrc1), - false); - OutStreamer->emitRawComment(" compute_pgm_rsrc2: " + - Twine::utohexstr(KernelInfo.ComputePGMRSrc2), - false); - OutStreamer->emitRawComment("enable_sgpr_private_segment_buffer = " + - Twine((bool)(header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE)), false); - OutStreamer->emitRawComment("enable_sgpr_kernarg_segment_ptr = " + - Twine((bool)(header.code_properties & - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR)), false); - OutStreamer->emitRawComment("private_element_size = 2 ", false); - OutStreamer->emitRawComment("is_ptr64 = " + - Twine((bool)(header.code_properties & AMD_CODE_PROPERTY_IS_PTR64)), false); - OutStreamer->emitRawComment("workitem_private_segment_byte_size = " + - Twine(header.workitem_private_segment_byte_size), - false); - OutStreamer->emitRawComment("workgroup_group_segment_byte_size = " + - Twine(header.workgroup_group_segment_byte_size), - false); - OutStreamer->emitRawComment("gds_segment_byte_size = " + - Twine(header.gds_segment_byte_size), false); - OutStreamer->emitRawComment("kernarg_segment_byte_size = " + - Twine(header.kernarg_segment_byte_size), false); - OutStreamer->emitRawComment("wavefront_sgpr_count = " + - Twine(header.wavefront_sgpr_count), false); - OutStreamer->emitRawComment("workitem_vgpr_count = " + - Twine(header.workitem_vgpr_count), false); - OutStreamer->emitRawComment("code_type = " + Twine(header.code_type), false); - OutStreamer->emitRawComment("wavefront_size = " + - Twine((int)header.wavefront_size), false); - OutStreamer->emitRawComment("optimization_level = " + - Twine(header.optimization_level), false); - OutStreamer->emitRawComment("hsail_profile = " + - Twine(header.hsail_profile), false); - OutStreamer->emitRawComment("hsail_machine_model = " + - Twine(header.hsail_machine_model), false); - OutStreamer->emitRawComment("hsail_version_major = " + - Twine(header.hsail_version_major), false); - OutStreamer->emitRawComment("hsail_version_minor = " + - Twine(header.hsail_version_minor), false); - } - - OutStreamer->EmitBytes(StringRef((char*)&header, sizeof(header))); -} - -bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. 
- - switch (ExtraCode[0]) { - default: - // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); - case 'r': - break; - } - } - - AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, - *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); - return false; -} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h deleted file mode 100644 index 1acff3a3222..00000000000 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ /dev/null @@ -1,113 +0,0 @@ -//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU Assembly printer class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H -#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H - -#include "llvm/CodeGen/AsmPrinter.h" -#include - -namespace llvm { - -class AMDGPUAsmPrinter : public AsmPrinter { -private: - struct SIProgramInfo { - SIProgramInfo() : - VGPRBlocks(0), - SGPRBlocks(0), - Priority(0), - FloatMode(0), - Priv(0), - DX10Clamp(0), - DebugMode(0), - IEEEMode(0), - ScratchSize(0), - ComputePGMRSrc1(0), - LDSBlocks(0), - ScratchBlocks(0), - ComputePGMRSrc2(0), - NumVGPR(0), - NumSGPR(0), - FlatUsed(false), - VCCUsed(false), - CodeLen(0) {} - - // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks; - uint32_t SGPRBlocks; - uint32_t Priority; - uint32_t FloatMode; - uint32_t Priv; - uint32_t DX10Clamp; - uint32_t DebugMode; - uint32_t IEEEMode; - uint32_t ScratchSize; - - uint64_t ComputePGMRSrc1; - - // Fields set in PGM_RSRC2 pm4 packet. - uint32_t LDSBlocks; - uint32_t ScratchBlocks; - - uint64_t ComputePGMRSrc2; - - uint32_t NumVGPR; - uint32_t NumSGPR; - uint32_t LDSSize; - bool FlatUsed; - - // Bonus information for debugging. - bool VCCUsed; - uint64_t CodeLen; - }; - - void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; - void findNumUsedRegistersSI(const MachineFunction &MF, - unsigned &NumSGPR, - unsigned &NumVGPR) const; - - /// \brief Emit register usage information so that the GPU driver - /// can correctly setup the GPU state. 
- void EmitProgramInfoR600(const MachineFunction &MF); - void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); - void EmitAmdKernelCodeT(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) const; - -public: - explicit AMDGPUAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer); - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "AMDGPU Assembly Printer"; - } - - /// Implemented in AMDGPUMCInstLower.cpp - void EmitInstruction(const MachineInstr *MI) override; - - void EmitEndOfAsmFile(Module &M) override; - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - -protected: - std::vector DisasmLines, HexLines; - size_t DisasmLineMaxLen; -}; - -} // End anonymous llvm - -#endif diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td deleted file mode 100644 index 6ffa7a08358..00000000000 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ /dev/null @@ -1,82 +0,0 @@ -//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This describes the calling conventions for the AMD Radeon GPUs. -// -//===----------------------------------------------------------------------===// - -// Inversion of CCIfInReg -class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {} - -// Calling convention for SI -def CC_SI : CallingConv<[ - - CCIfInReg>>, - - CCIfInReg>>, - - CCIfNotInReg>>, - - CCIfByVal>> - -]>; - -// Calling convention for R600 -def CC_R600 : CallingConv<[ - CCIfInReg>> -]>; - -// Calling convention for compute kernels -def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateStack"> -]>; - -def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >=" - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() >= " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo> -]>; diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp deleted file mode 100644 index 8175786fb9b..00000000000 --- a/lib/Target/R600/AMDGPUFrameLowering.cpp +++ /dev/null @@ -1,112 +0,0 @@ -//===----------------------- AMDGPUFrameLowering.cpp ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -// Interface to describe a layout of a stack frame on a AMDIL target machine -// -//===----------------------------------------------------------------------===// -#include "AMDGPUFrameLowering.h" -#include "AMDGPURegisterInfo.h" -#include "R600MachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Instructions.h" - -using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { } - -AMDGPUFrameLowering::~AMDGPUFrameLowering() { } - -unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { - - // XXX: Hardcoding to 1 for now. - // - // I think the StackWidth should stored as metadata associated with the - // MachineFunction. This metadata can either be added by a frontend, or - // calculated by a R600 specific LLVM IR pass. - // - // The StackWidth determines how stack objects are laid out in memory. - // For a vector stack variable, like: int4 stack[2], the data will be stored - // in the following ways depending on the StackWidth. - // - // StackWidth = 1: - // - // T0.X = stack[0].x - // T1.X = stack[0].y - // T2.X = stack[0].z - // T3.X = stack[0].w - // T4.X = stack[1].x - // T5.X = stack[1].y - // T6.X = stack[1].z - // T7.X = stack[1].w - // - // StackWidth = 2: - // - // T0.X = stack[0].x - // T0.Y = stack[0].y - // T1.X = stack[0].z - // T1.Y = stack[0].w - // T2.X = stack[1].x - // T2.Y = stack[1].y - // T3.X = stack[1].z - // T3.Y = stack[1].w - // - // StackWidth = 4: - // T0.X = stack[0].x - // T0.Y = stack[0].y - // T0.Z = stack[0].z - // T0.W = stack[0].w - // T1.X = stack[1].x - // T1.Y = stack[1].y - // T1.Z = stack[1].z - // T1.W = stack[1].w - return 1; -} - -/// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Start the offset at 2 so we don't overwrite work group information. - // XXX: We should only do this when the shader actually uses this - // information. - unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4); - int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; - - for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); - OffsetBytes += MFI->getObjectSize(i); - // Each register holds 4 bytes, so we must always align the offset to at - // least 4 bytes, so that 2 frame objects won't share the same register. 
- OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); - } - - if (FI != -1) - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); - - return OffsetBytes / (getStackWidth(MF) * 4); -} - -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return nullptr; -} -void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} - -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h deleted file mode 100644 index 9f31be1af79..00000000000 --- a/lib/Target/R600/AMDGPUFrameLowering.h +++ /dev/null @@ -1,45 +0,0 @@ -//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface to describe a layout of a stack frame on a AMDIL target -/// machine. -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H - -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -/// \brief Information about the stack frame layout on the AMDGPU targets. -/// -/// It holds the direction of the stack growth, the known stack alignment on -/// entry to each function, and the offset to the locals area. -/// See TargetFrameInfo for more comments. -class AMDGPUFrameLowering : public TargetFrameLowering { -public: - AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1); - virtual ~AMDGPUFrameLowering(); - - /// \returns The number of 32-bit sub-registers that are used when storing - /// values to the stack. - unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; -}; -} // namespace llvm -#endif diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp deleted file mode 100644 index df4461eac4d..00000000000 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ /dev/null @@ -1,1371 +0,0 @@ -//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Defines an instruction selector for the AMDGPU target. 
-// -//===----------------------------------------------------------------------===// -#include "AMDGPUInstrInfo.h" -#include "AMDGPUISelLowering.h" // For AMDGPUISD -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "SIDefines.h" -#include "SIISelLowering.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" - -using namespace llvm; - -//===----------------------------------------------------------------------===// -// Instruction Selector Implementation -//===----------------------------------------------------------------------===// - -namespace { -/// AMDGPU specific code to select AMDGPU machine instructions for -/// SelectionDAG operations. -class AMDGPUDAGToDAGISel : public SelectionDAGISel { - // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can - // make the right decision when generating code for different targets. - const AMDGPUSubtarget *Subtarget; -public: - AMDGPUDAGToDAGISel(TargetMachine &TM); - virtual ~AMDGPUDAGToDAGISel(); - bool runOnMachineFunction(MachineFunction &MF) override; - SDNode *Select(SDNode *N) override; - const char *getPassName() const override; - void PostprocessISelDAG() override; - -private: - bool isInlineImmediate(SDNode *N) const; - bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, - const R600InstrInfo *TII); - bool FoldOperands(unsigned, const R600InstrInfo *, std::vector &); - bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector &); - - // Complex pattern selectors - bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); - bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); - bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - - static bool checkType(const Value *ptr, unsigned int addrspace); - static bool checkPrivateAddress(const MachineMemOperand *Op); - - static bool isGlobalStore(const StoreSDNode *N); - static bool isFlatStore(const StoreSDNode *N); - static bool isPrivateStore(const StoreSDNode *N); - static bool isLocalStore(const StoreSDNode *N); - static bool isRegionStore(const StoreSDNode *N); - - bool isCPLoad(const LoadSDNode *N) const; - bool isConstantLoad(const LoadSDNode *N, int cbID) const; - bool isGlobalLoad(const LoadSDNode *N) const; - bool isFlatLoad(const LoadSDNode *N) const; - bool isParamLoad(const LoadSDNode *N) const; - bool isPrivateLoad(const LoadSDNode *N) const; - bool isLocalLoad(const LoadSDNode *N) const; - bool isRegionLoad(const LoadSDNode *N) const; - - SDNode *glueCopyToM0(SDNode *N) const; - - const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; - bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, - SDValue& Offset); - bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, - unsigned OffsetBits) const; - bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; - bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, - SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue 
&VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, SDValue &Offset, - SDValue &SLC) const; - bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr, - SDValue &SOffset, SDValue &ImmOffset) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &GLC) const; - SDNode *SelectAddrSpaceCast(SDNode *N); - bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp, SDValue &Omod) const; - - bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Omod) const; - bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods, - SDValue &Clamp, - SDValue &Omod) const; - - SDNode *SelectADD_SUB_I64(SDNode *N); - SDNode *SelectDIV_SCALE(SDNode *N); - - SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width); - SDNode *SelectS_BFEFromShifts(SDNode *N); - SDNode *SelectS_BFE(SDNode *N); - - // Include the pieces autogenerated from the target description. -#include "AMDGPUGenDAGISel.inc" -}; -} // end anonymous namespace - -/// \brief This pass converts a legalized DAG into a AMDGPU-specific -// DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) { - return new AMDGPUDAGToDAGISel(TM); -} - -AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) - : SelectionDAGISel(TM) {} - -bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast(MF.getSubtarget()); - return SelectionDAGISel::runOnMachineFunction(MF); -} - -AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { -} - -bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const { - const SITargetLowering *TL - = static_cast(getTargetLowering()); - return TL->analyzeImmediate(N) == 0; -} - -/// \brief Determine the register class for \p OpNo -/// \returns The register class of the virtual register that will be used for -/// the given operand number \OpNo or NULL if the register class cannot be -/// determined. 
-const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, - unsigned OpNo) const { - if (!N->isMachineOpcode()) - return nullptr; - - switch (N->getMachineOpcode()) { - default: { - const MCInstrDesc &Desc = - Subtarget->getInstrInfo()->get(N->getMachineOpcode()); - unsigned OpIdx = Desc.getNumDefs() + OpNo; - if (OpIdx >= Desc.getNumOperands()) - return nullptr; - int RegClass = Desc.OpInfo[OpIdx].RegClass; - if (RegClass == -1) - return nullptr; - - return Subtarget->getRegisterInfo()->getRegClass(RegClass); - } - case AMDGPU::REG_SEQUENCE: { - unsigned RCID = cast(N->getOperand(0))->getZExtValue(); - const TargetRegisterClass *SuperRC = - Subtarget->getRegisterInfo()->getRegClass(RCID); - - SDValue SubRegOp = N->getOperand(OpNo + 1); - unsigned SubRegIdx = cast(SubRegOp)->getZExtValue(); - return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, - SubRegIdx); - } - } -} - -bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - return SelectADDRParam(Addr, R1, R2); -} - - -bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - return true; -} - -SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - !checkType(cast(N)->getMemOperand()->getValue(), - AMDGPUAS::LOCAL_ADDRESS)) - return N; - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - // Write max value to m0 before each load operation - - SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N), - CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32)); - - SDValue Glue = M0.getValue(1); - - SmallVector Ops; - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - Ops.push_back(N->getOperand(i)); - } - Ops.push_back(Glue); - CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); - - return N; -} - -SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { - unsigned int Opc = N->getOpcode(); - if (N->isMachineOpcode()) { - N->setNodeId(-1); - return nullptr; // Already selected. 
- } - - if (isa(N)) - N = glueCopyToM0(N); - - switch (Opc) { - default: break; - // We are selecting i64 ADD here instead of custom lower it during - // DAG legalization, so we can fold some i64 ADDs used for address - // calculation into the LOAD and STORE instructions. - case ISD::ADD: - case ISD::SUB: { - if (N->getValueType(0) != MVT::i64 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - return SelectADD_SUB_I64(N); - } - case ISD::SCALAR_TO_VECTOR: - case AMDGPUISD::BUILD_VERTICAL_VECTOR: - case ISD::BUILD_VECTOR: { - unsigned RegClassID; - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); - EVT VT = N->getValueType(0); - unsigned NumVectorElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - assert(EltVT.bitsEq(MVT::i32)); - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - bool UseVReg = true; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (!U->isMachineOpcode()) { - continue; - } - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (!RC) { - continue; - } - if (static_cast(TRI)->isSGPRClass(RC)) { - UseVReg = false; - } - } - switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : - AMDGPU::SReg_32RegClassID; - break; - case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : - AMDGPU::SReg_64RegClassID; - break; - case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : - AMDGPU::SReg_128RegClassID; - break; - case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : - AMDGPU::SReg_256RegClassID; - break; - case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : - AMDGPU::SReg_512RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } else { - // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG - // that adds a 128 bits reg copy when going through TwoAddressInstructions - // pass. We want to avoid 128 bits copies as much as possible because they - // can't be bundled by our scheduler. - switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: - if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; - else - RegClassID = AMDGPU::R600_Reg128RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } - - SDLoc DL(N); - SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - - if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, - N->getOperand(0), RegClass); - } - - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " - "supported yet"); - // 16 = Max Num Vector Elements - // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) - // 1 = Vector Register Class - SmallVector RegSeqArgs(NumVectorElts * 2 + 1); - - RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - bool IsRegSeq = true; - unsigned NOps = N->getNumOperands(); - for (unsigned i = 0; i < NOps; i++) { - // XXX: Why is this here? - if (isa(N->getOperand(i))) { - IsRegSeq = false; - break; - } - RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); - } - - if (NOps != NumVectorElts) { - // Fill in the missing undef elements if this was a scalar_to_vector. 
- assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); - - MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - DL, EltVT); - for (unsigned i = NOps; i < NumVectorElts; ++i) { - RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); - } - } - - if (!IsRegSeq) - break; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs); - } - case ISD::BUILD_PAIR: { - SDValue RC, SubReg0, SubReg1; - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - break; - } - SDLoc DL(N); - if (N->getValueType(0) == MVT::i128) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); - SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32); - SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32); - } else if (N->getValueType(0) == MVT::i64) { - RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32); - SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - } else { - llvm_unreachable("Unhandled value type for BUILD_PAIR"); - } - const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, - N->getOperand(1), SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); - } - - case ISD::Constant: - case ISD::ConstantFP: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) - break; - - uint64_t Imm; - if (ConstantFPSDNode *FP = dyn_cast(N)) - Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue(); - else { - ConstantSDNode *C = cast(N); - Imm = C->getZExtValue(); - } - - SDLoc DL(N); - SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, - MVT::i32)); - SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); - const SDValue Ops[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, - N->getValueType(0), Ops); - } - - case ISD::LOAD: { - LoadSDNode *LD = cast(N); - SDLoc SL(N); - EVT VT = N->getValueType(0); - - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { - N = glueCopyToM0(N); - break; - } - - // To simplify the TableGen patters, we replace all i64 loads with - // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 - // during DAG legalization, however, so places (ExpandUnalignedLoad) - // in the DAG legalizer assume that if i64 is legal, so doing this - // promotion early can cause problems. - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); - SDNode *Load = glueCopyToM0(NewLoad.getNode()); - SelectCode(Load); - N = BitCast.getNode(); - break; - } - - case ISD::STORE: { - // Handle i64 stores here for the same reason mentioned above for loads. 
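// --- Editor's illustrative aside (not part of the original change) ---------
// The LOAD and STORE cases here rely on viewing an i64 value as a pair of
// 32-bit words (the i64 <-> v2i32 bitcast), so the TableGen patterns only
// have to cover 32-bit memory operations. A minimal standalone sketch of
// that equivalence, assuming a lo/hi word split:
#include <cstdint>
#include <utility>

// Split a 64-bit value into its low and high 32-bit halves.
static inline std::pair<uint32_t, uint32_t> splitI64(uint64_t V) {
  return {static_cast<uint32_t>(V), static_cast<uint32_t>(V >> 32)};
}

// Rejoin the halves; splitI64 followed by joinI64 is the identity, which is
// why replacing i64 memory operations with v2i32 ones plus a bitcast is safe.
static inline uint64_t joinI64(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}
// ---------------------------------------------------------------------------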
- StoreSDNode *ST = cast(N); - SDValue Value = ST->getValue(); - if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { - - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), - MVT::v2i32, Value); - SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, - ST->getBasePtr(), ST->getMemOperand()); - - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); - - if (NewValue.getOpcode() == ISD::BITCAST) { - Select(NewStore.getNode()); - return SelectCode(NewValue.getNode()); - } - - // getNode() may fold the bitcast if its input was another bitcast. If that - // happens we should only select the new store. - N = NewStore.getNode(); - } - - N = glueCopyToM0(N); - break; - } - - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } - - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - // There is a scalar version available, but unlike the vector version which - // has a separate operand for the offset and width, the scalar version packs - // the width and offset into a single operand. Try to move to the scalar - // version if the offsets are constant, so that we can try to keep extended - // loads of kernel arguments in SGPRs. - - // TODO: Technically we could try to pattern match scalar bitshifts of - // dynamic values, but it's probably not useful. - ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); - if (!Offset) - break; - - ConstantSDNode *Width = dyn_cast(N->getOperand(2)); - if (!Width) - break; - - bool Signed = Opc == AMDGPUISD::BFE_I32; - - uint32_t OffsetVal = Offset->getZExtValue(); - uint32_t WidthVal = Width->getZExtValue(); - - return getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), - N->getOperand(0), OffsetVal, WidthVal); - - } - case AMDGPUISD::DIV_SCALE: { - return SelectDIV_SCALE(N); - } - case ISD::CopyToReg: { - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - Lowering.legalizeTargetIndependentNode(N, *CurDAG); - break; - } - case ISD::ADDRSPACECAST: - return SelectAddrSpaceCast(N); - case ISD::AND: - case ISD::SRL: - case ISD::SRA: - if (N->getValueType(0) != MVT::i32 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - - return SelectS_BFE(N); - } - - return SelectCode(N); -} - - -bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { - assert(AS != 0 && "Use checkPrivateAddress instead."); - if (!Ptr) - return false; - - return Ptr->getType()->getPointerAddressSpace() == AS; -} - -bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { - if (Op->getPseudoValue()) - return true; - - if (PointerType *PT = dyn_cast(Op->getValue()->getType())) - return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; - - return false; -} - -bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - const Value *MemVal = N->getMemOperand()->getValue(); - return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); -} - -bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - const Value *MemVal = N->getMemOperand()->getValue(); - if (CbId == -1) - return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); - - return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); -} - -bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { - if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getMemoryVT().bitsLT(MVT::i32)) - return true; - - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { - MachineMemOperand *MMO = N->getMemOperand(); - if (checkPrivateAddress(N->getMemOperand())) { - if (MMO) { - const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV == PseudoSourceValue::getConstantPool()) { - return true; - } - } - } - return false; -} - -bool 
AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkPrivateAddress(N->getMemOperand())) { - // Check to make sure we are not a constant pool load or a constant load - // that is marked as a private load - if (isCPLoad(N) || isConstantLoad(N, -1)) { - return false; - } - } - - const Value *MemVal = N->getMemOperand()->getValue(); - if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && - !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { - return true; - } - return false; -} - -const char *AMDGPUDAGToDAGISel::getPassName() const { - return "AMDGPU DAG->DAG Pattern Instruction Selection"; -} - -#ifdef DEBUGTMP -#undef INT64_C -#endif -#undef DEBUGTMP - -//===----------------------------------------------------------------------===// -// Complex Patterns -//===----------------------------------------------------------------------===// - -bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { - if (ConstantSDNode *Cst = dyn_cast(Addr)) { - IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), - true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, - SDValue& BaseReg, SDValue &Offset) { - if (!isa(Addr)) { - BaseReg = Addr; - Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *IMMOffset; - - if (Addr.getOpcode() == ISD::ADD - && (IMMOffset = dyn_cast(Addr.getOperand(1))) - && isInt<16>(IMMOffset->getZExtValue())) { - - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - // If the pointer address is constant, we can move it to the offset field. 
- } else if ((IMMOffset = dyn_cast(Addr)) - && isInt<16>(IMMOffset->getZExtValue())) { - Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - } - - // Default case, no offset - Base = Addr; - Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *C; - SDLoc DL(Addr); - - if ((C = dyn_cast(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); - Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); - } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && - (C = dyn_cast(Addr.getOperand(1)))) { - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); - } else { - Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); - } - - return true; -} - -SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { - SDLoc DL(N); - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - bool IsAdd = (N->getOpcode() == ISD::ADD); - - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub0); - SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, LHS, Sub1); - - SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub0); - SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, RHS, Sub1); - - SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - - - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - - SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs); - SDValue Carry(AddLo, 1); - SDNode *AddHi - = CurDAG->getMachineNode(CarryOpc, DL, MVT::i32, - SDValue(Hi0, 0), SDValue(Hi1, 0), Carry); - - SDValue Args[5] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - SDValue(AddLo,0), - Sub0, - SDValue(AddHi,0), - Sub1, - }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); -} - -// We need to handle this here because tablegen doesn't support matching -// instructions with multiple outputs. -SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { - SDLoc SL(N); - EVT VT = N->getValueType(0); - - assert(VT == MVT::f32 || VT == MVT::f64); - - unsigned Opc - = (VT == MVT::f64) ? 
AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod - SDValue Ops[8]; - - SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); - SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); - SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); -} - -bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, - unsigned OffsetBits) const { - if ((OffsetBits == 16 && !isUInt<16>(Offset)) || - (OffsetBits == 8 && !isUInt<8>(Offset))) - return false; - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) - return true; - - // On Southern Islands instruction with a negative base value and an offset - // don't seem to work. - return CurDAG->SignBitIsZero(Base); -} - -bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, - SDValue &Offset) const { - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { - // (add n0, c0) - Base = N0; - Offset = N1; - return true; - } - } - - SDLoc DL(Addr); - - // If we have a constant address, prefer to put the constant into the - // offset. This can save moves to load the constant address since multiple - // operations can share the zero base address register, and enables merging - // into read2 / write2 instructions. - if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { - if (isUInt<16>(CAddr->getZExtValue())) { - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); - Base = SDValue(MovZero, 0); - Offset = Addr; - return true; - } - } - - // default case - Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, - SDValue &Offset0, - SDValue &Offset1) const { - SDLoc DL(Addr); - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - unsigned DWordOffset0 = C1->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - // (add n0, c0) - if (isDSOffsetLegal(N0, DWordOffset1, 8)) { - Base = N0; - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); - return true; - } - } - - if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { - unsigned DWordOffset0 = CAddr->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - assert(4 * DWordOffset0 == CAddr->getZExtValue()); - - if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { - SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero - = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); - Base = SDValue(MovZero, 0); - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); - return true; - } - } - - // default case - Base = Addr; - Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); - return true; -} - -static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isUInt<12>(Imm->getZExtValue()); -} - -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, - SDValue &VAddr, SDValue 
&SOffset, - SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, - SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { - SDLoc DL(Addr); - - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - - Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); - Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); - Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - - if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) -> addr64 - SDValue N2 = N0.getOperand(0); - SDValue N3 = N0.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); - Ptr = N2; - VAddr = N3; - } else { - - // (add N0, C1) -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = N0; - } - - if (isLegalMUBUFImmOffset(C1)) { - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; - } else if (isUInt<32>(C1->getZExtValue())) { - // Illegal offset, store it in soffset. - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), - 0); - return; - } - } - - if (Addr.getOpcode() == ISD::ADD) { - // (add N0, N1) -> addr64 - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); - Ptr = N0; - VAddr = N1; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; - } - - // default case -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE) const { - SDValue Ptr, Offen, Idxen, Addr64; - - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); - - ConstantSDNode *C = cast(Addr64); - if (C->getSExtValue()) { - SDLoc DL(Addr); - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0); - return true; - } - - return false; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { - SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE; - - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE); -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &ImmOffset) const { - - SDLoc DL(Addr); - MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, 
MVT::i32, Sym0), 0); - - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); - - // (add n0, c1) - if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast(N1); - - if (isLegalMUBUFImmOffset(C1)) { - VAddr = Addr.getOperand(0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } - } - - // (node) - VAddr = Addr; - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return true; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &SOffset, SDValue &Offset, - SDValue &GLC, SDValue &SLC, - SDValue &TFE) const { - SDValue Ptr, VAddr, Offen, Idxen, Addr64; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); - - if (!cast(Offen)->getSExtValue() && - !cast(Idxen)->getSExtValue() && - !cast(Addr64)->getSExtValue()) { - uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | - APInt::getAllOnesValue(32).getZExtValue(); // Size - SDLoc DL(Addr); - - const SITargetLowering& Lowering = - *static_cast(getTargetLowering()); - - SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset, - SDValue &GLC) const { - SDValue SLC, TFE; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); -} - -// FIXME: This is incorrect and only enough to be able to compile. -SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { - AddrSpaceCastSDNode *ASC = cast(N); - SDLoc DL(N); - - assert(Subtarget->hasFlatAddressSpace() && - "addrspacecast only supported with flat address space!"); - - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && - "Can only cast to / from flat address space!"); - - // The flat instructions read the address as the index of the VGPR holding the - // address, so casting should just be reinterpreting the base VGPR, so just - // insert trunc / bitcast / zext. 
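// --- Editor's illustrative aside (not part of the original change) ---------
// The addrspacecast handling that follows only resizes the address bits: a
// 64-bit flat pointer narrowed to 32 bits keeps the low word (sub0), a
// 32-bit pointer widened to 64 bits gets a zero high word, and same-size
// casts are plain bitcasts. A minimal integer-level sketch of those rules:
#include <cstdint>

static inline uint32_t narrowFlatAddrTo32(uint64_t Addr64) {
  return static_cast<uint32_t>(Addr64);   // corresponds to EXTRACT_SUBREG sub0
}

static inline uint64_t widenAddrToFlat64(uint32_t Addr32) {
  return static_cast<uint64_t>(Addr32);   // corresponds to REG_SEQUENCE {addr, 0}
}
// ---------------------------------------------------------------------------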
- - SDValue Src = ASC->getOperand(0); - EVT DestVT = ASC->getValueType(0); - EVT SrcVT = Src.getValueType(); - - unsigned SrcSize = SrcVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); - - if (SrcSize > DestSize) { - assert(SrcSize == 64 && DestSize == 32); - return CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, - DL, - DestVT, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); - } - - - if (DestSize > SrcSize) { - assert(SrcSize == 32 && DestSize == 64); - - // FIXME: This is probably wrong, we should never be defining - // a register class with both VGPRs and SGPRs - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, - MVT::i32); - - const SDValue Ops[] = { - RC, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(0, DL, MVT::i32)), 0), - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); - } - - assert(SrcSize == 64 && DestSize == 64); - return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); -} - -SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width) { - // Transformation function, pack the offset and width of a BFE into - // the format expected by the S_BFE_I32 / S_BFE_U32. In the second - // source, bits [5:0] contain the offset and bits [22:16] the width. - uint32_t PackedVal = Offset | (Width << 16); - SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32); - - return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); -} - -SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { - // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) - // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) - // Predicate: 0 < b <= c < 32 - - const SDValue &Shl = N->getOperand(0); - ConstantSDNode *B = dyn_cast(Shl->getOperand(1)); - ConstantSDNode *C = dyn_cast(N->getOperand(1)); - - if (B && C) { - uint32_t BVal = B->getZExtValue(); - uint32_t CVal = C->getZExtValue(); - - if (0 < BVal && BVal <= CVal && CVal < 32) { - bool Signed = N->getOpcode() == ISD::SRA; - unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; - - return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), - CVal - BVal, 32 - CVal); - } - } - return SelectCode(N); -} - -SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { - switch (N->getOpcode()) { - case ISD::AND: - if (N->getOperand(0).getOpcode() == ISD::SRL) { - // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)" - // Predicate: isMask(mask) - const SDValue &Srl = N->getOperand(0); - ConstantSDNode *Shift = dyn_cast(Srl.getOperand(1)); - ConstantSDNode *Mask = dyn_cast(N->getOperand(1)); - - if (Shift && Mask) { - uint32_t ShiftVal = Shift->getZExtValue(); - uint32_t MaskVal = Mask->getZExtValue(); - - if (isMask_32(MaskVal)) { - uint32_t WidthVal = countPopulation(MaskVal); - - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), - ShiftVal, WidthVal); - } - } - } - break; - case ISD::SRL: - if (N->getOperand(0).getOpcode() == ISD::AND) { - // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)" - // Predicate: isMask(mask >> b) - const SDValue &And = N->getOperand(0); - ConstantSDNode *Shift = dyn_cast(N->getOperand(1)); - ConstantSDNode *Mask = dyn_cast(And->getOperand(1)); - - if (Shift && Mask) { - uint32_t ShiftVal = Shift->getZExtValue(); - uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal; - - if (isMask_32(MaskVal)) { - uint32_t WidthVal = countPopulation(MaskVal); - - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), - ShiftVal, WidthVal); - } - } - } else if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); - break; - case ISD::SRA: - if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); - break; - } - - return SelectCode(N); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - - unsigned Mods = 0; - - Src = In; - - if (Src.getOpcode() == ISD::FNEG) { - Mods |= SISrcMods::NEG; - Src = Src.getOperand(0); - } - - if (Src.getOpcode() == ISD::FABS) { - Mods |= SISrcMods::ABS; - Src = Src.getOperand(0); - } - - SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - - return true; -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, - SDValue &SrcMods, SDValue &Clamp, - SDValue &Omod) const { - SDLoc DL(In); - // FIXME: Handle Clamp and Omod - Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32); - Omod = CurDAG->getTargetConstant(0, DL, MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Omod) const { - // FIXME: Handle Omod - Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); - - return SelectVOP3Mods(In, Src, SrcMods); -} - -bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, - SDValue &SrcMods, - SDValue &Clamp, - SDValue &Omod) const { - Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); - return SelectVOP3Mods(In, Src, SrcMods); -} - -void AMDGPUDAGToDAGISel::PostprocessISelDAG() { - const AMDGPUTargetLowering& Lowering = - *static_cast(getTargetLowering()); - bool IsModified = false; - do { - IsModified = false; - // Go over all selected nodes and try to fold them a bit more - for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), - E = CurDAG->allnodes_end(); I != E; ++I) { - - SDNode *Node = I; - - MachineSDNode *MachineNode = dyn_cast(I); - if (!MachineNode) - continue; - - SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); - if (ResNode != Node) { - ReplaceUses(Node, 
ResNode); - IsModified = true; - } - } - CurDAG->RemoveDeadNodes(); - } while (IsModified); -} diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp deleted file mode 100644 index d56838ec201..00000000000 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ /dev/null @@ -1,2866 +0,0 @@ -//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This is the parent TargetLowering class for hardware code gen -/// targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUISelLowering.h" -#include "AMDGPU.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600MachineFunctionInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" - -using namespace llvm; - -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. -class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - -static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - - return true; -} - -#include "AMDGPUGenCallingConv.inc" - -// Find a larger type to do a load / store of a vector with. -EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, StoreSize); - - assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - -// Type for a vector that will be loaded to. 
-EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { - unsigned StoreSize = VT.getStoreSizeInBits(); - if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, 32); - - return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); -} - -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : TargetLowering(TM), Subtarget(&STI) { - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); - - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - - // Library functions. These default to Expand, but we have instructions - // for them. - setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); - - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); - - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); - - // Expand to fneg + fadd. - setOperationAction(ISD::FSUB, MVT::f64, Expand); - - // Lower floating point store/load to integer store/load to reduce the number - // of patterns in tablegen. - setOperationAction(ISD::STORE, MVT::f32, Promote); - AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); - - setOperationAction(ISD::STORE, MVT::v2f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); - - setOperationAction(ISD::STORE, MVT::v4f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v8f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); - - setOperationAction(ISD::STORE, MVT::v16f32, Promote); - AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); - - setOperationAction(ISD::STORE, MVT::f64, Promote); - AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); - - setOperationAction(ISD::STORE, MVT::v2f64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); - - // Custom lowering of vector stores is required for local address space - // stores. - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); - - // XXX: This can be change to Custom, once ExpandVectorStores can - // handle 64-bit stores. 
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - - setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i8, Expand); - setTruncStoreAction(MVT::i64, MVT::i1, Expand); - setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); - setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); - - - setOperationAction(ISD::LOAD, MVT::f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); - - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); - - setOperationAction(ISD::LOAD, MVT::v4f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); - - setOperationAction(ISD::LOAD, MVT::v8f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); - - setOperationAction(ISD::LOAD, MVT::v16f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - - setOperationAction(ISD::LOAD, MVT::f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); - - setOperationAction(ISD::LOAD, MVT::v2f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); - - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - - // There are no 64-bit extloads. These should be done as a 32-bit extload and - // an extension to 64-bit. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } - - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } - - if (!Subtarget->hasBFI()) { - // fcopysign can be done in a single instruction with BFI. 
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } - - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); - setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - - // GPU does not have divrem function for signed or unsigned. - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); - - // GPU does not have [S|U]MUL_LOHI functions as a single instruction. - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - } - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); - - // The hardware supports 32-bit ROTR, but not ROTL. - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - setOperationAction(ISD::ROTR, MVT::i64, Expand); - - setOperationAction(ISD::MUL, MVT::i64, Expand); - setOperationAction(ISD::MULHU, MVT::i64, Expand); - setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - - setOperationAction(ISD::SMIN, MVT::i32, Legal); - setOperationAction(ISD::UMIN, MVT::i32, Legal); - setOperationAction(ISD::SMAX, MVT::i32, Legal); - setOperationAction(ISD::UMAX, MVT::i32, Legal); - - if (!Subtarget->hasFFBH()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - - if (!Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - - static const MVT::SimpleValueType VectorIntTypes[] = { - MVT::v2i32, MVT::v4i32 - }; - - for (MVT VT : VectorIntTypes) { - // Expand the following operations for the current type by default. 
- setOperationAction(ISD::ADD, VT, Expand); - setOperationAction(ISD::AND, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT, Expand); - setOperationAction(ISD::FP_TO_UINT, VT, Expand); - setOperationAction(ISD::MUL, VT, Expand); - setOperationAction(ISD::OR, VT, Expand); - setOperationAction(ISD::SHL, VT, Expand); - setOperationAction(ISD::SRA, VT, Expand); - setOperationAction(ISD::SRL, VT, Expand); - setOperationAction(ISD::ROTL, VT, Expand); - setOperationAction(ISD::ROTR, VT, Expand); - setOperationAction(ISD::SUB, VT, Expand); - setOperationAction(ISD::SINT_TO_FP, VT, Expand); - setOperationAction(ISD::UINT_TO_FP, VT, Expand); - setOperationAction(ISD::SDIV, VT, Expand); - setOperationAction(ISD::UDIV, VT, Expand); - setOperationAction(ISD::SREM, VT, Expand); - setOperationAction(ISD::UREM, VT, Expand); - setOperationAction(ISD::SMUL_LOHI, VT, Expand); - setOperationAction(ISD::UMUL_LOHI, VT, Expand); - setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::XOR, VT, Expand); - setOperationAction(ISD::BSWAP, VT, Expand); - setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - } - - static const MVT::SimpleValueType FloatVectorTypes[] = { - MVT::v2f32, MVT::v4f32 - }; - - for (MVT VT : FloatVectorTypes) { - setOperationAction(ISD::FABS, VT, Expand); - setOperationAction(ISD::FMINNUM, VT, Expand); - setOperationAction(ISD::FMAXNUM, VT, Expand); - setOperationAction(ISD::FADD, VT, Expand); - setOperationAction(ISD::FCEIL, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FDIV, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FFLOOR, VT, Expand); - setOperationAction(ISD::FTRUNC, VT, Expand); - setOperationAction(ISD::FMUL, VT, Expand); - setOperationAction(ISD::FMA, VT, Expand); - setOperationAction(ISD::FRINT, VT, Expand); - setOperationAction(ISD::FNEARBYINT, VT, Expand); - setOperationAction(ISD::FSQRT, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSUB, VT, Expand); - setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); - setOperationAction(ISD::VSELECT, VT, Expand); - setOperationAction(ISD::SELECT_CC, VT, Expand); - setOperationAction(ISD::FCOPYSIGN, VT, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); - } - - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); - - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - - setBooleanContents(ZeroOrNegativeOneBooleanContent); - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - 
setSchedulingPreference(Sched::RegPressure); - setJumpIsExpensive(true); - - // SI at least has hardware support for floating point exceptions, but no way - // of using or handling them is implemented. They are also optional in OpenCL - // (Section 7.3) - setHasFloatingPointExceptions(false); - - setSelectIsExpensive(false); - PredictableSelectIsExpensive = false; - - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); - setFsqrtIsCheap(true); - - // FIXME: Need to really handle these. - MaxStoresPerMemcpy = 4096; - MaxStoresPerMemmove = 4096; - MaxStoresPerMemset = 4096; -} - -//===----------------------------------------------------------------------===// -// Target Information -//===----------------------------------------------------------------------===// - -MVT AMDGPUTargetLowering::getVectorIdxTy() const { - return MVT::i32; -} - -bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { - return true; -} - -// The backend supports 32 and 64 bit floating point immediates. -// FIXME: Why are we reporting vectors of FP immediates as legal? -bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); -} - -// We don't want to shrink f64 / f32 constants. -bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { - EVT ScalarVT = VT.getScalarType(); - return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); -} - -bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, - ISD::LoadExtType, - EVT NewVT) const { - - unsigned NewSize = NewVT.getStoreSizeInBits(); - - // If we are reducing to a 32-bit load, this is always better. - if (NewSize == 32) - return true; - - EVT OldVT = N->getValueType(0); - unsigned OldSize = OldVT.getStoreSizeInBits(); - - // Don't produce extloads from sub 32-bit types. SI doesn't have scalar - // extloads, so doing one requires using a buffer_load. In cases where we - // still couldn't use a scalar load, using the wider load shouldn't really - // hurt anything. - - // If the old size already had to be an extload, there's no harm in continuing - // to reduce the width. - return (OldSize < 32); -} - -bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, - EVT CastTy) const { - if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) - return true; - - unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); - unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); - - return ((LScalarSize <= CastScalarSize) || - (CastScalarSize >= 32) || - (LScalarSize < 32)); -} - -// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also -// profitable with the expansion for 64-bit since it's generally good to -// speculate things. -// FIXME: These should really have the size as a parameter. 
-bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const { - return true; -} - -bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { - return true; -} - -//===---------------------------------------------------------------------===// -// Target Properties -//===---------------------------------------------------------------------===// - -bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { - assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; -} - -bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64; -} - -bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, - unsigned NumElem, - unsigned AS) const { - return true; -} - -bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { - // Truncate is just accessing a subregister. - return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); -} - -bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { - // Truncate is just accessing a subregister. - return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && - (Dest->getPrimitiveSizeInBits() % 32 == 0); -} - -bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { - const DataLayout *DL = getDataLayout(); - unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); - unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); - - return SrcSize == 32 && DestSize == 64; -} - -bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { - // Any register load of a 64-bit value really requires 2 32-bit moves. For all - // practical purposes, the extra mov 0 to load a 64-bit is free. As used, - // this will enable reducing 64-bit operations the 32-bit, which is always - // good. - return Src == MVT::i32 && Dest == MVT::i64; -} - -bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { - return isZExtFree(Val.getValueType(), VT2); -} - -bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { - // There aren't really 64-bit registers, but pairs of 32-bit ones and only a - // limited number of native 64-bit operations. Shrinking an operation to fit - // in a single 32-bit register should always be helpful. As currently used, - // this is much less general than the name suggests, and is only used in - // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is - // not profitable, and may actually be harmful. 
- return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; -} - -//===---------------------------------------------------------------------===// -// TargetLowering Callbacks -//===---------------------------------------------------------------------===// - -void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl &Ins) const { - - State.AnalyzeFormalArguments(Ins, CC_AMDGPU); -} - -SDValue AMDGPUTargetLowering::LowerReturn( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); -} - -//===---------------------------------------------------------------------===// -// Target specific lowering -//===---------------------------------------------------------------------===// - -SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SDValue Callee = CLI.Callee; - SelectionDAG &DAG = CLI.DAG; - - const Function &Fn = *DAG.getMachineFunction().getFunction(); - - StringRef FuncName(""); - - if (const ExternalSymbolSDNode *G = dyn_cast(Callee)) - FuncName = G->getSymbol(); - else if (const GlobalAddressSDNode *G = dyn_cast(Callee)) - FuncName = G->getGlobal()->getName(); - - DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); - DAG.getContext()->diagnose(NoCalls); - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, - SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: - Op.getNode()->dump(); - llvm_unreachable("Custom lowering code for this" - "instruction is not implemented yet!"); - break; - case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); - case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); - case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); - case ISD::FREM: return LowerFREM(Op, DAG); - case ISD::FCEIL: return LowerFCEIL(Op, DAG); - case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); - case ISD::FRINT: return LowerFRINT(Op, DAG); - case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); - case ISD::FROUND: return LowerFROUND(Op, DAG); - case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); - case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); - case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); - case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); - case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); - } - return Op; -} - -void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - case ISD::SIGN_EXTEND_INREG: - // Different parts of legalization seem to interpret which type of - // sign_extend_inreg is the one to check for custom lowering. The extended - // from type is what really matters, but some places check for custom - // lowering of the result type. This results in trying to use - // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do - // nothing here and let the illegal result integer be handled normally. 
- return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - if (!Node) - return; - - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); - return; - } - case ISD::STORE: { - SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); - if (Lowered.getNode()) - Results.push_back(Lowered); - return; - } - default: - return; - } -} - -// FIXME: This implements accesses to initialized globals in the constant -// address space by copying them to private and accessing that. It does not -// properly handle illegal types or vectors. The private vector loads are not -// scalarized, and the illegal scalars hit an assertion. This technique will not -// work well with large initializers, and this should eventually be -// removed. Initialized globals should be placed into a data section that the -// runtime will load into a buffer before the kernel is executed. Uses of the -// global need to be replaced with a pointer loaded from an implicit kernel -// argument into this buffer holding the copy of the data, which will remove the -// need for any of this. -SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, - const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const { - const DataLayout *TD = getDataLayout(); - SDLoc DL(InitPtr); - Type *InitTy = Init->getType(); - - if (const ConstantInt *CI = dyn_cast(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(InitTy)); - } - - if (const ConstantFP *CFP = dyn_cast(Init)) { - EVT VT = EVT::getEVT(CFP->getType()); - PointerType *PtrTy = PointerType::get(CFP->getType(), 0); - return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(CFP->getType())); - } - - if (StructType *ST = dyn_cast(InitTy)) { - const StructLayout *SL = TD->getStructLayout(ST); - - EVT PtrVT = InitPtr.getValueType(); - SmallVector Chains; - - for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { - SDValue Offset = DAG.getConstant(SL->getElementOffset(I), DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(I); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if (SequentialType *SeqTy = dyn_cast(InitTy)) { - EVT PtrVT = InitPtr.getValueType(); - - unsigned NumElements; - if (ArrayType *AT = dyn_cast(SeqTy)) - NumElements = AT->getNumElements(); - else if (VectorType *VT = dyn_cast(SeqTy)) - NumElements = VT->getNumElements(); - else - llvm_unreachable("Unexpected type"); - - unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); - SmallVector Chains; - for (unsigned i = 0; i < NumElements; ++i) { - SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); - - Constant *Elt = Init->getAggregateElement(i); - Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); - } - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); - } - - if 
(isa(Init)) { - EVT VT = EVT::getEVT(InitTy); - PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); - return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, false, - TD->getPrefTypeAlignment(InitTy)); - } - - Init->dump(); - llvm_unreachable("Unhandled constant initializer"); -} - -static bool hasDefinedInitializer(const GlobalValue *GV) { - const GlobalVariable *GVar = dyn_cast(GV); - if (!GVar || !GVar->hasInitializer()) - return false; - - if (isa(GVar->getInitializer())) - return false; - - return true; -} - -SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, - SDValue Op, - SelectionDAG &DAG) const { - - const DataLayout *TD = getDataLayout(); - GlobalAddressSDNode *G = cast(Op); - const GlobalValue *GV = G->getGlobal(); - - switch (G->getAddressSpace()) { - case AMDGPUAS::LOCAL_ADDRESS: { - // XXX: What does the value of G->getOffset() mean? - assert(G->getOffset() == 0 && - "Do not know what to do with an non-zero offset"); - - // TODO: We could emit code to handle the initialization somewhere. - if (hasDefinedInitializer(GV)) - break; - - unsigned Offset; - if (MFI->LocalMemoryObjects.count(GV) == 0) { - uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); - Offset = MFI->LDSSize; - MFI->LocalMemoryObjects[GV] = Offset; - // XXX: Account for alignment? - MFI->LDSSize += Size; - } else { - Offset = MFI->LocalMemoryObjects[GV]; - } - - return DAG.getConstant(Offset, SDLoc(Op), - getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - Type *EltType = GV->getType()->getElementType(); - unsigned Size = TD->getTypeAllocSize(EltType); - unsigned Alignment = TD->getPrefTypeAlignment(EltType); - - MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); - MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); - - int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); - - const GlobalVariable *Var = cast(GV); - if (!Var->hasInitializer()) { - // This has no use, but bugpoint will hit it. 
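      // (That is: with no initializer there is nothing to copy into the
      //  private-memory shadow, but returning the bare frame index keeps
      //  reduced test cases, such as those produced by bugpoint, compiling.)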
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } - - const Constant *Init = Var->getInitializer(); - SmallVector WorkList; - - for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), - E = DAG.getEntryNode()->use_end(); I != E; ++I) { - if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) - continue; - WorkList.push_back(*I); - } - SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); - for (SmallVector::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - SmallVector Ops; - Ops.push_back(Chain); - for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { - Ops.push_back((*I)->getOperand(i)); - } - DAG.UpdateNodeOperands(*I, Ops); - } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } - } - - const Function &Fn = *DAG.getMachineFunction().getFunction(); - DiagnosticInfoUnsupported BadInit(Fn, - "initializer for address space"); - DAG.getContext()->diagnose(BadInit); - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, - SelectionDAG &DAG) const { - SmallVector Args; - - for (const SDUse &U : Op->ops()) - DAG.ExtractVectorElements(U.get(), Args); - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); -} - -SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, - SelectionDAG &DAG) const { - - SmallVector Args; - unsigned Start = cast(Op.getOperand(1))->getZExtValue(); - EVT VT = Op.getValueType(); - DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, - VT.getVectorNumElements()); - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); -} - -SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, - SelectionDAG &DAG) const { - - MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); - - FrameIndexSDNode *FIN = cast(Op); - - unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); - return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), - Op.getValueType()); -} - -SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - switch (IntrinsicID) { - default: return Op; - case AMDGPUIntrinsic::AMDGPU_abs: - case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. - return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDGPU_lrp: - return LowerIntrinsicLRP(Op, DAG); - - case AMDGPUIntrinsic::AMDGPU_clamp: - case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. - return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast(Op.getOperand(3)); - if (!Param) - return DAG.getUNDEF(VT); - - // Translate to the operands expected by the machine instruction. The - // first parameter must be the same as the first instruction. - SDValue Numerator = Op.getOperand(1); - SDValue Denominator = Op.getOperand(2); - - // Note this order is opposite of the machine instruction's operations, - // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The - // intrinsic has the numerator as the first operand to match a normal - // division operation. - - SDValue Src0 = Param->isAllOnesValue() ? 
Numerator : Denominator; - - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, - Denominator, Numerator); - } - - case Intrinsic::AMDGPU_div_fmas: - return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - - case Intrinsic::AMDGPU_div_fixup: - return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::AMDGPU_rcp: - return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq: - return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_legacy_rsq: - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq_clamped: - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - Type *Type = VT.getTypeForEVT(*DAG.getContext()); - APFloat Max = APFloat::getLargest(Type->getFltSemantics()); - APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); - - SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, DL, VT)); - return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, - DAG.getConstantFP(Min, DL, VT)); - } else { - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); - } - - case Intrinsic::AMDGPU_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imax: - return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umax: - return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_imin: - return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umin: - return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umul24: - return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imul24: - return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umad24: - return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_imad24: - return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfi: - return DAG.getNode(AMDGPUISD::BFI, DL, VT, - Op.getOperand(1), - 
Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfm: - return DAG.getNode(AMDGPUISD::BFM, DL, VT, - Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_class: - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. - return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. - return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); - } -} - -///IABS(a) = SMAX(sub(0, a), a) -SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - Op.getOperand(1)); - - return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); -} - -/// Linear Interpolation -/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) -SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, - DAG.getConstantFP(1.0f, DL, MVT::f32), - Op.getOperand(1)); - SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, - Op.getOperand(3)); - return DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), - OneSubAC); -} - -/// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return SDValue(); - - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - ISD::CondCode CCOpcode = cast(CC)->get(); - switch (CCOpcode) { - case ISD::SETOEQ: - case ISD::SETONE: - case ISD::SETUNE: - case ISD::SETNE: - case ISD::SETUEQ: - case ISD::SETEQ: - case ISD::SETFALSE: - case ISD::SETFALSE2: - case ISD::SETTRUE: - case ISD::SETTRUE2: - case ISD::SETUO: - case ISD::SETO: - break; - case ISD::SETULE: - case ISD::SETULT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); - } - case ISD::SETOLE: - case ISD::SETOLT: - case ISD::SETLE: - case ISD::SETLT: { - // Ordered. Assume ordered for undefined. - - // Only do this after legalization to avoid interfering with other combines - // which might occur. - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && - !DCI.isCalledByLegalizer()) - return SDValue(); - - // We need to permute the operands to get the correct NaN behavior. The - // selected operand is the second one based on the failing compare with NaN, - // so permute it based on the compare type the hardware uses. 
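    // Illustrative mapping for the ordered case below:
    // (select (setolt x, y), x, y) becomes fmin_legacy(x, y) and
    // (select (setolt x, y), y, x) becomes fmax_legacy(y, x); the unordered
    // cases above use the swapped operand order so that a NaN input still
    // produces the value the original select would have chosen.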
- if (LHS == True) - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS); - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETOGE: - case ISD::SETOGT: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG && - !DCI.isCalledByLegalizer()) - return SDValue(); - - if (LHS == True) - return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS); - return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS); - } - case ISD::SETCC_INVALID: - llvm_unreachable("Invalid setcc condcode!"); - } - return SDValue(); -} - -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - -SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, - SelectionDAG &DAG) const { - LoadSDNode *Load = cast(Op); - EVT MemVT = Load->getMemoryVT(); - EVT MemEltVT = MemVT.getVectorElementType(); - - EVT LoadVT = Op.getValueType(); - EVT EltVT = LoadVT.getVectorElementType(); - EVT PtrVT = Load->getBasePtr().getValueType(); - - unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); - SmallVector Loads; - SmallVector Chains; - - SDLoc SL(Op); - unsigned MemEltSize = MemEltVT.getStoreSize(); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * MemEltSize, SL, PtrVT)); - - SDValue NewLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, - SrcValue.getWithOffset(i * MemEltSize), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - Loads.push_back(NewLoad.getValue(0)); - Chains.push_back(NewLoad.getValue(1)); - } - - SDValue Ops[] = { - DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) - }; - - return DAG.getMergeValues(Ops, SL); -} - -SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - // If this is a 2 element vector, we really want to scalarize and not create - // weird 1 element vectors. 
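  // For example, a <2 x i32> load is emitted here as two plain i32 loads,
  // whereas letting the generic split path run would produce two <1 x i32>
  // loads that nothing downstream really wants.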
- if (VT.getVectorNumElements() == 2) - return ScalarizeVectorLoad(Op, DAG); - - LoadSDNode *Load = cast(Op); - SDValue BasePtr = Load->getBasePtr(); - EVT PtrVT = BasePtr.getValueType(); - EVT MemVT = Load->getMemoryVT(); - SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); - - EVT LoVT, HiVT; - EVT LoMemVT, HiMemVT; - SDValue Lo, Hi; - - std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); - std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); - SDValue LoLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, - SrcValue, - LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - - SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); - - SDValue HiLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, - Load->getChain(), HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - - SDValue Ops[] = { - DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - LoLoad.getValue(1), HiLoad.getValue(1)) - }; - - return DAG.getMergeValues(Ops, SL); -} - -SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast(Op); - EVT MemVT = Store->getMemoryVT(); - unsigned MemBits = MemVT.getSizeInBits(); - - // Byte stores are really expensive, so if possible, try to pack 32-bit vector - // truncating store into an i32 store. - // XXX: We could also handle optimize other vector bitwidths. - if (!MemVT.isVector() || MemBits > 32) { - return SDValue(); - } - - SDLoc DL(Op); - SDValue Value = Store->getValue(); - EVT VT = Value.getValueType(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Ptr = Store->getBasePtr(); - EVT MemEltVT = MemVT.getVectorElementType(); - unsigned MemEltBits = MemEltVT.getSizeInBits(); - unsigned MemNumElements = MemVT.getVectorNumElements(); - unsigned PackedSize = MemVT.getStoreSizeInBits(); - SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, DL, MVT::i32); - - assert(Value.getValueType().getScalarSizeInBits() >= 32); - - SDValue PackedValue; - for (unsigned i = 0; i < MemNumElements; ++i) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, - DAG.getConstant(i, DL, MVT::i32)); - Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); - Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg - - SDValue Shift = DAG.getConstant(MemEltBits * i, DL, MVT::i32); - Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); - - if (i == 0) { - PackedValue = Elt; - } else { - PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); - } - } - - if (PackedSize < 32) { - EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); - return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - PackedVT, - Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); - } - - return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - Store->isVolatile(), Store->isNonTemporal(), - Store->getAlignment()); -} - -SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast(Op); - EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); - EVT EltVT 
    = Store->getValue().getValueType().getVectorElementType();
-  EVT PtrVT = Store->getBasePtr().getValueType();
-  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
-  SDLoc SL(Op);
-
-  SmallVector<SDValue, 8> Chains;
-
-  unsigned EltSize = MemEltVT.getStoreSize();
-  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-
-  for (unsigned i = 0, e = NumElts; i != e; ++i) {
-    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                              Store->getValue(),
-                              DAG.getConstant(i, SL, MVT::i32));
-
-    SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT);
-    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
-    SDValue NewStore =
-      DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
-                        SrcValue.getWithOffset(i * EltSize),
-                        MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
-                        Store->getAlignment());
-    Chains.push_back(NewStore);
-  }
-
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
-}
-
-SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  SDValue Val = Store->getValue();
-  EVT VT = Val.getValueType();
-
-  // If this is a 2 element vector, we really want to scalarize and not create
-  // weird 1 element vectors.
-  if (VT.getVectorNumElements() == 2)
-    return ScalarizeVectorStore(Op, DAG);
-
-  EVT MemVT = Store->getMemoryVT();
-  SDValue Chain = Store->getChain();
-  SDValue BasePtr = Store->getBasePtr();
-  SDLoc SL(Op);
-
-  EVT LoVT, HiVT;
-  EVT LoMemVT, HiMemVT;
-  SDValue Lo, Hi;
-
-  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
-  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
-
-  EVT PtrVT = BasePtr.getValueType();
-  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
-                              DAG.getConstant(LoMemVT.getStoreSize(), SL,
-                                              PtrVT));
-
-  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-  SDValue LoStore
-    = DAG.getTruncStore(Chain, SL, Lo,
-                        BasePtr,
-                        SrcValue,
-                        LoMemVT,
-                        Store->isNonTemporal(),
-                        Store->isVolatile(),
-                        Store->getAlignment());
-  SDValue HiStore
-    = DAG.getTruncStore(Chain, SL, Hi,
-                        HiPtr,
-                        SrcValue.getWithOffset(LoMemVT.getStoreSize()),
-                        HiMemVT,
-                        Store->isNonTemporal(),
-                        Store->isVolatile(),
-                        Store->getAlignment());
-
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
-}
-
-
-SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  LoadSDNode *Load = cast<LoadSDNode>(Op);
-  ISD::LoadExtType ExtType = Load->getExtensionType();
-  EVT VT = Op.getValueType();
-  EVT MemVT = Load->getMemoryVT();
-
-  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
-    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
-    // FIXME: Copied from PPC
-    // First, load into 32 bits, then truncate to 1 bit.
-
-    SDValue Chain = Load->getChain();
-    SDValue BasePtr = Load->getBasePtr();
-    MachineMemOperand *MMO = Load->getMemOperand();
-
-    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
-                                   BasePtr, MVT::i8, MMO);
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
-      NewLD.getValue(1)
-    };
-
-    return DAG.getMergeValues(Ops, DL);
-  }
-
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-      Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
-      ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
-    return SDValue();
-
-  // Get the register holding the target.
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, DL, MVT::i32));
-  // Load the Register.
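  // Worked example (illustrative): an 8-bit extload from private address 7
  // reads the 32-bit register at index 7 >> 2 = 1, selects byte 7 & 3 = 3 by
  // shifting right 3 * 8 = 24 bits, and then sign- or zero-extends the byte,
  // which is exactly the node sequence built below.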
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - - // Get offset within the register. - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); - - // Bit offset of target byte (byteIdx * 8). - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); - - // Eliminate the upper bits by setting them to ... - EVT MemEltVT = MemVT.getScalarType(); - - // ... ones. - if (ExtType == ISD::SEXTLOAD) { - SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); - } - - // ... or zeros. - SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); - if (Result.getNode()) { - return Result; - } - - StoreSDNode *Store = cast(Op); - SDValue Chain = Store->getChain(); - if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && - Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); - } - - EVT MemVT = Store->getMemoryVT(); - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && - MemVT.bitsLT(MVT::i32)) { - unsigned Mask = 0; - if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; - } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; - } - SDValue BasePtr = Store->getBasePtr(); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, - DAG.getConstant(0x3, DL, MVT::i32)); - - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, - Store->getValue()); - - SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); - - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, - MaskedValue, ShiftAmt); - - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(Mask, DL, MVT::i32), - ShiftAmt); - DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, DL, MVT::i32)); - Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); - - SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - } - return SDValue(); -} - -// This is a shortcut for integer division because we have fast i32<->f32 -// conversions, and fast f32 reciprocal instructions. The fractional part of a -// float is enough to accurately represent up to a 24-bit integer. 
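// Illustrative walk-through for the unsigned case: for 100 / 7, fa = 100.0f,
// fb = 7.0f, fq = trunc(fa * rcp(fb)) = 14.0f, so iq = 14 and the recomputed
// remainder fr = |fa - fq * fb| = 2.0f. Since fr < fb, the select drops the
// +1 correction (jq becomes 0) and the result is 100 / 7 = 14 with remainder
// 100 - 14 * 7 = 2. If rcp() had rounded the quotient down by one, fr would
// be >= fb and jq = 1 would bump the quotient back up.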
-SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - MVT IntVT = MVT::i32; - MVT FltVT = MVT::f32; - - ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; - ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; - - if (VT.isVector()) { - unsigned NElts = VT.getVectorNumElements(); - IntVT = MVT::getVectorVT(MVT::i32, NElts); - FltVT = MVT::getVectorVT(MVT::f32, NElts); - } - - unsigned BitSize = VT.getScalarType().getSizeInBits(); - - SDValue jq = DAG.getConstant(1, DL, IntVT); - - if (sign) { - // char|short jq = ia ^ ib; - jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); - - // jq = jq >> (bitsize - 2) - jq = DAG.getNode(ISD::SRA, DL, VT, jq, - DAG.getConstant(BitSize - 2, DL, VT)); - - // jq = jq | 0x1 - jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, IntVT); - } - - // int ia = (int)LHS; - SDValue ia = sign ? - DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); - - // int ib, (int)RHS; - SDValue ib = sign ? - DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); - - // float fa = (float)ia; - SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); - - // float fb = (float)ib; - SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); - - // float fq = native_divide(fa, fb); - SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, - fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); - - // fq = trunc(fq); - fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); - - // float fqneg = -fq; - SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); - - // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, - DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); - - // int iq = (int)fq; - SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); - - // fr = fabs(fr); - fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); - - // fb = fabs(fb); - fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); - - // int cv = fr >= fb; - SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); - - // jq = (cv ? jq : 0); - jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); - - // dst = trunc/extend to legal type - iq = sign ? 
DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); - - // dst = iq + jq; - SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); - - // Rem needs compensation, it's easier to recompute it - SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); - Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); - - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); -} - -void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, - SelectionDAG &DAG, - SmallVectorImpl &Results) const { - assert(Op.getValueType() == MVT::i64); - - SDLoc DL(Op); - EVT VT = Op.getValueType(); - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - SDValue one = DAG.getConstant(1, DL, HalfVT); - SDValue zero = DAG.getConstant(0, DL, HalfVT); - - //HiLo split - SDValue LHS = Op.getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); - - SDValue RHS = Op.getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); - - if (VT == MVT::i64 && - DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && - DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { - - SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), - LHS_Lo, RHS_Lo); - - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); - Results.push_back(DIV); - Results.push_back(REM); - return; - } - - // Get Speculative values - SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); - - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; - - const unsigned halfBitWidth = HalfVT.getSizeInBits(); - - for (unsigned i = 0; i < halfBitWidth; ++i) { - const unsigned bitPos = halfBitWidth - i - 1; - SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); - // Get value of high bit - SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); - HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); - - // Shift - REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT)); - // Add LHS high bit - REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); - - SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); - - DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); - - // Update REM - SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS); - REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); - } - - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); - Results.push_back(DIV); - Results.push_back(REM); -} - -SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - if (VT == MVT::i64) { - SmallVector Results; - LowerUDIVREM64(Op, DAG, Results); - return DAG.getMergeValues(Results, DL); - } - - SDValue Num = Op.getOperand(0); - SDValue Den = Op.getOperand(1); - - if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && - DAG.MaskedValueIsZero(Den, 
APInt::getHighBitsSet(32, 8))) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? - return LowerDIVREM24(Op, DAG, false); - } - } - - // RCP = URECIP(Den) = 2^32 / Den + e - // e is rounding error. - SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); - - // RCP_LO = mul(RCP, Den) */ - SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den); - - // RCP_HI = mulhu (RCP, Den) */ - SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); - - // NEG_RCP_LO = -RCP_LO - SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - RCP_LO); - - // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) - SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), - NEG_RCP_LO, RCP_LO, - ISD::SETEQ); - // Calculate the rounding error from the URECIP instruction - // E = mulhu(ABS_RCP_LO, RCP) - SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); - - // RCP_A_E = RCP + E - SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); - - // RCP_S_E = RCP - E - SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); - - // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) - SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT), - RCP_A_E, RCP_S_E, - ISD::SETEQ); - // Quotient = mulhu(Tmp0, Num) - SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); - - // Num_S_Remainder = Quotient * Den - SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den); - - // Remainder = Num - Num_S_Remainder - SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); - - // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) - SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, - DAG.getConstant(-1, DL, VT), - DAG.getConstant(0, DL, VT), - ISD::SETUGE); - // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) - SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, - Num_S_Remainder, - DAG.getConstant(-1, DL, VT), - DAG.getConstant(0, DL, VT), - ISD::SETUGE); - // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero - SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, - Remainder_GE_Zero); - - // Calculate Division result: - - // Quotient_A_One = Quotient + 1 - SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Quotient_S_One = Quotient - 1 - SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, - DAG.getConstant(1, DL, VT)); - - // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) - SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), - Quotient, Quotient_A_One, ISD::SETEQ); - - // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) - Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Quotient_S_One, Div, ISD::SETEQ); - - // Calculate Rem result: - - // Remainder_S_Den = Remainder - Den - SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); - - // Remainder_A_Den = Remainder + Den - SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); - - // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) - SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT), - Remainder, Remainder_S_Den, ISD::SETEQ); - - // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) - Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT), - Remainder_A_Den, Rem, ISD::SETEQ); - SDValue Ops[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue NegOne = DAG.getConstant(-1, DL, VT); - - if (VT == MVT::i32 && - DAG.ComputeNumSignBits(LHS) > 8 && - DAG.ComputeNumSignBits(RHS) > 8) { - return LowerDIVREM24(Op, DAG, true); - } - if (VT == MVT::i64 && - DAG.ComputeNumSignBits(LHS) > 32 && - DAG.ComputeNumSignBits(RHS) > 32) { - EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - - //HiLo split - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); - SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), - LHS_Lo, RHS_Lo); - SDValue Res[2] = { - DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)), - DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1)) - }; - return DAG.getMergeValues(Res, DL); - } - - SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); - SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); - SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); - SDValue RSign = LHSign; // Remainder sign is the same as LHS - - LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); - RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); - - LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); - RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); - - SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); - SDValue Rem = Div.getValue(1); - - Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); - Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); - - Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); - Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); - - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); -} - -// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) -SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - EVT VT = Op.getValueType(); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); - SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); - - return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); -} - -SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - // result = trunc(src) - // if (src > 0.0 && src != result) - // result += 1.0 - - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - - SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); - SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); - SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); - - SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); -} - -static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { - const 
unsigned FractBits = 52; - const unsigned ExpBits = 11; - - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - Hi, - DAG.getConstant(FractBits - 32, SL, MVT::i32), - DAG.getConstant(ExpBits, SL, MVT::i32)); - SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, - DAG.getConstant(1023, SL, MVT::i32)); - - return Exp; -} - -SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - assert(Op.getValueType() == MVT::f64); - - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - - SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); - - // Extract the upper half, since this is where we will find the sign and - // exponent. - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); - - SDValue Exp = extractF64Exponent(Hi, SL, DAG); - - const unsigned FractBits = 52; - - // Extract the sign bit. - const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); - SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); - - // Extend back to to 64-bits. - SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, - Zero, SignBit); - SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); - - SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); - const SDValue FractMask - = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64); - - SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); - SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); - SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - - const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32); - - SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); - SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); - - SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); - SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); - - return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); -} - -SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - assert(Op.getValueType() == MVT::f64); - - APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); - SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); - SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); - - SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); - SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); - - SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); - - APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); - SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); - - return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); -} - -SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { - // FNEARBYINT and FRINT are the same, except in their handling of FP - // exceptions. Those aren't really meaningful for us, and OpenCL only has - // rint, so just treat them as equivalent. - return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); -} - -// XXX - May require not supporting f32 denormals? 
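// Illustrative example for the f32 lowering below: round(-2.7f) computes
// T = trunc(-2.7) = -2.0 and |X - T| = 0.7 >= 0.5, so the result is
// T + copysign(1.0, X) = -3.0; for -2.3f the difference 0.3 is below 0.5 and
// the result stays at T = -2.0.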
-SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); - - SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); - - SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32); - - SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); - - SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); - - SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); - - return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); -} - -SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - - SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); - - const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - const SDValue One = DAG.getConstant(1, SL, MVT::i32); - const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32); - const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32); - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); - - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); - - SDValue Exp = extractF64Exponent(Hi, SL, DAG); - - const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL, - MVT::i64); - - SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); - SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, - DAG.getConstant(INT64_C(0x0008000000000000), SL, - MVT::i64), - Exp); - - SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); - SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, - DAG.getConstant(0, SL, MVT::i64), Tmp0, - ISD::SETNE); - - SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, - D, DAG.getConstant(0, SL, MVT::i64)); - SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); - - K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); - K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); - - SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); - SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); - SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); - - SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, - ExpEqNegOne, - DAG.getConstantFP(1.0, SL, MVT::f64), - DAG.getConstantFP(0.0, SL, MVT::f64)); - - SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); - - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); - K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); - - return K; -} - -SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (VT == MVT::f32) - return LowerFROUND32(Op, DAG); - - if (VT == MVT::f64) - return LowerFROUND64(Op, DAG); - - llvm_unreachable("unhandled type"); -} - -SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - // result = trunc(src); - // if (src < 0.0 && src != result) - // result += -1.0. 
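  // Illustrative: floor(-2.3) = trunc(-2.3) + (-1.0) = -3.0, while floor(2.3)
  // and floor(-2.0) keep the truncated value unchanged.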
- - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64); - const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); - - SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); - SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); - SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); - - SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); - return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); -} - -SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, - bool Signed) const { - SDLoc SL(Op); - SDValue Src = Op.getOperand(0); - - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); - - SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(0, SL, MVT::i32)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, - DAG.getConstant(1, SL, MVT::i32)); - - SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, - SL, MVT::f64, Hi); - - SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); - - SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, - DAG.getConstant(32, SL, MVT::i32)); - - return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); -} - -SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); - - EVT DestVT = Op.getValueType(); - if (DestVT == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, false); - - assert(DestVT == MVT::f32); - - SDLoc DL(Op); - - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); -} - -SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) - return LowerINT_TO_FP64(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, - bool Signed) const { - SDLoc SL(Op); - - SDValue Src = Op.getOperand(0); - - SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); - - SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL, - MVT::f64); - SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, - MVT::f64); - - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); - - SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); - - - SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc); - - SDValue Hi = DAG.getNode(Signed ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL, - MVT::i32, FloorMul); - SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); - - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); - - return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, true); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op, - SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - - if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64) - return LowerFP64_TO_INT(Op, DAG, false); - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, - SelectionDAG &DAG) const { - EVT ExtraVT = cast(Op.getOperand(1))->getVT(); - MVT VT = Op.getSimpleValueType(); - MVT ScalarVT = VT.getScalarType(); - - if (!VT.isVector()) - return SDValue(); - - SDValue Src = Op.getOperand(0); - SDLoc DL(Op); - - // TODO: Don't scalarize on Evergreen? - unsigned NElts = VT.getVectorNumElements(); - SmallVector Args; - DAG.ExtractVectorElements(Src, Args, 0, NElts); - - SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); - for (unsigned I = 0; I < NElts; ++I) - Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); -} - -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// - -static bool isU24(SDValue Op, SelectionDAG &DAG) { - APInt KnownZero, KnownOne; - EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, KnownZero, KnownOne); - - return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; -} - -static bool isI24(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. - return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated - // as unsigned 24-bit values. - (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; -} - -static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { - - SelectionDAG &DAG = DCI.DAG; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT VT = Op.getValueType(); - - APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); -} - -template -static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, - uint32_t Offset, uint32_t Width, SDLoc DL) { - if (Width + Offset < 32) { - uint32_t Shl = static_cast(Src0) << (32 - Offset - Width); - IntTy Result = static_cast(Shl) >> (32 - Width); - return DAG.getConstant(Result, DL, MVT::i32); - } - - return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); -} - -static bool usesAllNormalStores(SDNode *LoadVal) { - for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { - if (!ISD::isNormalStore(*I)) - return false; - } - - return true; -} - -// If we have a copy of an illegal type, replace it with a load / store of an -// equivalently sized legal type. This avoids intermediate bit pack / unpack -// instructions emitted when handling extloads and truncstores. 
Ideally we could -// recognize the pack / unpack pattern to eliminate it. -SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (!DCI.isBeforeLegalize()) - return SDValue(); - - StoreSDNode *SN = cast(N); - SDValue Value = SN->getValue(); - EVT VT = Value.getValueType(); - - if (isTypeLegal(VT) || SN->isVolatile() || - !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) - return SDValue(); - - LoadSDNode *LoadVal = cast(Value); - if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) - return SDValue(); - - EVT MemVT = LoadVal->getMemoryVT(); - - SDLoc SL(N); - SelectionDAG &DAG = DCI.DAG; - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); - - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - LoadVT, SL, - LoadVal->getChain(), - LoadVal->getBasePtr(), - LoadVal->getOffset(), - LoadVT, - LoadVal->getMemOperand()); - - SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); - DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); - - return DAG.getStore(SN->getChain(), SL, NewLoad, - SN->getBasePtr(), SN->getMemOperand()); -} - -SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - - if (VT.isVector() || VT.getSizeInBits() > 32) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDValue Mul; - - if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { - N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); - } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { - N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); - N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); - Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); - } else { - return SDValue(); - } - - // We need to use sext even for MUL_U24, because MUL_U24 is used - // for signed multiply of 8 and 16-bit types. - return DAG.getSExtOrTrunc(Mul, DL, VT); -} - -SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - switch(N->getOpcode()) { - default: break; - case ISD::MUL: - return performMulCombine(N, DCI); - case AMDGPUISD::MUL_I24: - case AMDGPUISD::MUL_U24: { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - simplifyI24(N0, DCI); - simplifyI24(N1, DCI); - return SDValue(); - } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
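      // Illustrative: on SI and newer, (select (setlt x, y), x, y) on i32 is
      // folded by CombineIMinMax above into smin (and the unsigned and
      // inverted predicates into umin/smax/umax); Evergreen-class targets
      // skip this until the corresponding instructions are implemented.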
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } - } - - break; - } - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - assert(!N->getValueType(0).isVector() && - "Vector handling of BFE not implemented"); - ConstantSDNode *Width = dyn_cast(N->getOperand(2)); - if (!Width) - break; - - uint32_t WidthVal = Width->getZExtValue() & 0x1f; - if (WidthVal == 0) - return DAG.getConstant(0, DL, MVT::i32); - - ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); - if (!Offset) - break; - - SDValue BitsFrom = N->getOperand(0); - uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; - - bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; - - if (OffsetVal == 0) { - // This is already sign / zero extended, so try to fold away extra BFEs. - unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); - - unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); - if (OpSignBits >= SignBits) - return BitsFrom; - - EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); - if (Signed) { - // This is a sign_extend_inreg. Replace it to take advantage of existing - // DAG Combines. If not eliminated, we will match back to BFE during - // selection. - - // TODO: The sext_inreg of extended types ends, although we can could - // handle them in a single BFE. - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, - DAG.getValueType(SmallVT)); - } - - return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); - } - - if (ConstantSDNode *CVal = dyn_cast(BitsFrom)) { - if (Signed) { - return constantFoldBFE(DAG, - CVal->getSExtValue(), - OffsetVal, - WidthVal, - DL); - } - - return constantFoldBFE(DAG, - CVal->getZExtValue(), - OffsetVal, - WidthVal, - DL); - } - - if ((OffsetVal + WidthVal) >= 32) { - SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); - return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, - BitsFrom, ShiftVal); - } - - if (BitsFrom.hasOneUse()) { - APInt Demanded = APInt::getBitsSet(32, - OffsetVal, - OffsetVal + WidthVal); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, - KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - } - - break; - } - - case ISD::STORE: - return performStoreCombine(N, DCI); - } - return SDValue(); -} - -//===----------------------------------------------------------------------===// -// Helper functions -//===----------------------------------------------------------------------===// - -void AMDGPUTargetLowering::getOriginalFunctionArgs( - SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl &Ins, - SmallVectorImpl &OrigIns) const { - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - if (Ins[i].ArgVT == Ins[i].VT) { - OrigIns.push_back(Ins[i]); - continue; - } - - EVT VT; - if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { - // Vector has been split into scalars. - VT = Ins[i].ArgVT.getVectorElementType(); - } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && - Ins[i].ArgVT.getVectorElementType() != - Ins[i].VT.getVectorElementType()) { - // Vector elements have been promoted - VT = Ins[i].ArgVT; - } else { - // Vector has been spilt into smaller vectors. 
- VT = Ins[i].VT; - } - - ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, - Ins[i].OrigArgIndex, Ins[i].PartOffset); - OrigIns.push_back(Arg); - } -} - -bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->isExactlyValue(1.0); - } - if (ConstantSDNode *C = dyn_cast(Op)) { - return C->isAllOnesValue(); - } - return false; -} - -bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast(Op)) { - return CFP->getValueAPF().isZero(); - } - if (ConstantSDNode *C = dyn_cast(Op)) { - return C->isNullValue(); - } - return false; -} - -SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned VirtualRegister; - if (!MRI.isLiveIn(Reg)) { - VirtualRegister = MRI.createVirtualRegister(RC); - MRI.addLiveIn(Reg, VirtualRegister); - } else { - VirtualRegister = MRI.getLiveInVirtReg(Reg); - } - return DAG.getRegister(VirtualRegister, VT); -} - -#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; - -const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((AMDGPUISD::NodeType)Opcode) { - case AMDGPUISD::FIRST_NUMBER: break; - // AMDIL DAG nodes - NODE_NAME_CASE(CALL); - NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(RET_FLAG); - NODE_NAME_CASE(BRANCH_COND); - - // AMDGPU DAG nodes - NODE_NAME_CASE(DWORDADDR) - NODE_NAME_CASE(FRACT) - NODE_NAME_CASE(CLAMP) - NODE_NAME_CASE(COS_HW) - NODE_NAME_CASE(SIN_HW) - NODE_NAME_CASE(FMAX_LEGACY) - NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DIV_SCALE) - NODE_NAME_CASE(DIV_FMAS) - NODE_NAME_CASE(DIV_FIXUP) - NODE_NAME_CASE(TRIG_PREOP) - NODE_NAME_CASE(RCP) - NODE_NAME_CASE(RSQ) - NODE_NAME_CASE(RSQ_LEGACY) - NODE_NAME_CASE(RSQ_CLAMPED) - NODE_NAME_CASE(LDEXP) - NODE_NAME_CASE(FP_CLASS) - NODE_NAME_CASE(DOT4) - NODE_NAME_CASE(CARRY) - NODE_NAME_CASE(BORROW) - NODE_NAME_CASE(BFE_U32) - NODE_NAME_CASE(BFE_I32) - NODE_NAME_CASE(BFI) - NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) - NODE_NAME_CASE(MUL_U24) - NODE_NAME_CASE(MUL_I24) - NODE_NAME_CASE(MAD_U24) - NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(TEXTURE_FETCH) - NODE_NAME_CASE(EXPORT) - NODE_NAME_CASE(CONST_ADDRESS) - NODE_NAME_CASE(REGISTER_LOAD) - NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_CONSTANT) - NODE_NAME_CASE(LOAD_INPUT) - NODE_NAME_CASE(SAMPLE) - NODE_NAME_CASE(SAMPLEB) - NODE_NAME_CASE(SAMPLED) - NODE_NAME_CASE(SAMPLEL) - NODE_NAME_CASE(CVT_F32_UBYTE0) - NODE_NAME_CASE(CVT_F32_UBYTE1) - NODE_NAME_CASE(CVT_F32_UBYTE2) - NODE_NAME_CASE(CVT_F32_UBYTE3) - NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) - NODE_NAME_CASE(CONST_DATA_PTR) - case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; - NODE_NAME_CASE(SENDMSG) - NODE_NAME_CASE(INTERP_MOV) - NODE_NAME_CASE(INTERP_P1) - NODE_NAME_CASE(INTERP_P2) - NODE_NAME_CASE(STORE_MSKOR) - NODE_NAME_CASE(TBUFFER_STORE_FORMAT) - case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; - } - return nullptr; -} - -SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = Operand.getValueType(); - - if (VT == MVT::f32) { - RefinementSteps = 0; - return 
DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand); - } - - // TODO: There is also f64 rsq instruction, but the documentation is less - // clear on its precision. - - return SDValue(); -} - -SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const { - SelectionDAG &DAG = DCI.DAG; - EVT VT = Operand.getValueType(); - - if (VT == MVT::f32) { - // Reciprocal, < 1 ulp error. - // - // This reciprocal approximation converges to < 0.5 ulp error with one - // newton rhapson performed with two fused multiple adds (FMAs). - - RefinementSteps = 0; - return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand); - } - - // TODO: There is also f64 rcp instruction, but the documentation is less - // clear on its precision. - - return SDValue(); -} - -static void computeKnownBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { - APInt Op0Zero, Op0One; - APInt Op1Zero, Op1One; - DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); - DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); - - KnownZero = Op0Zero & Op1Zero; - KnownOne = Op0One & Op1One; -} - -void AMDGPUTargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) const { - - KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. - - APInt KnownZero2; - APInt KnownOne2; - unsigned Opc = Op.getOpcode(); - - switch (Opc) { - default: - break; - case ISD::INTRINSIC_WO_CHAIN: { - // FIXME: The intrinsic should just use the node. - switch (cast(Op.getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::AMDGPU_imax: - case AMDGPUIntrinsic::AMDGPU_umax: - case AMDGPUIntrinsic::AMDGPU_imin: - case AMDGPUIntrinsic::AMDGPU_umin: - computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); - break; - default: - break; - } - - break; - } - case AMDGPUISD::CARRY: - case AMDGPUISD::BORROW: { - KnownZero = APInt::getHighBitsSet(32, 31); - break; - } - - case AMDGPUISD::BFE_I32: - case AMDGPUISD::BFE_U32: { - ConstantSDNode *CWidth = dyn_cast(Op.getOperand(2)); - if (!CWidth) - return; - - unsigned BitWidth = 32; - uint32_t Width = CWidth->getZExtValue() & 0x1f; - - if (Opc == AMDGPUISD::BFE_U32) - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); - - break; - } - } -} - -unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( - SDValue Op, - const SelectionDAG &DAG, - unsigned Depth) const { - switch (Op.getOpcode()) { - case AMDGPUISD::BFE_I32: { - ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); - if (!Width) - return 1; - - unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) - return SignBits; - - // TODO: Could probably figure something out with non-0 offsets. - unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); - return std::max(SignBits, Op0SignBits); - } - - case AMDGPUISD::BFE_U32: { - ConstantSDNode *Width = dyn_cast(Op.getOperand(2)); - return Width ? 
32 - (Width->getZExtValue() & 0x1f) : 1; - } - - case AMDGPUISD::CARRY: - case AMDGPUISD::BORROW: - return 31; - - default: - return 1; - } -} diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h deleted file mode 100644 index fbb7d3c8843..00000000000 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ /dev/null @@ -1,307 +0,0 @@ -//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition of the TargetLowering class that is common -/// to all AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H - -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -class AMDGPUMachineFunction; -class AMDGPUSubtarget; -class MachineRegisterInfo; - -class AMDGPUTargetLowering : public TargetLowering { -protected: - const AMDGPUSubtarget *Subtarget; - -private: - SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, - const SDValue &InitPtr, - SDValue Chain, - SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - /// \brief Lower vector stores by merging the vector elements into an integer - /// of the same bitwidth. - SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into multiple scalar stores. - /// \returns The resulting chain. - - SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; - - SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; - -protected: - static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); - static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); - - virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const; - - /// \brief Split a vector load into a scalar load of each component. 
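The BFE_I32 / BFE_U32 handling above (constantFoldBFE, the PerformDAGCombine cases, and the known-bits / sign-bits hooks) all assume the same scalar semantics for the bitfield-extract nodes. Purely as an illustration, here is a minimal C++ model of those semantics; the bfe_u32 / bfe_i32 helper names are invented, the width is assumed to be pre-masked to 0-31 as in the code above, and arithmetic right shift of negative values is assumed:

    #include <cstdint>

    // BFE_U32: extract `width` bits starting at `offset`, zero-extended to 32 bits.
    static uint32_t bfe_u32(uint32_t src, uint32_t offset, uint32_t width) {
      if (width == 0)
        return 0;                               // mirrors the WidthVal == 0 fold
      if (offset + width < 32)
        return (src << (32 - offset - width)) >> (32 - width);
      return src >> offset;                     // field reaches bit 31: plain shift
    }

    // BFE_I32: same field, sign-extended, so the result has at least
    // 32 - width + 1 sign bits (width = 8 gives >= 25), which is what
    // ComputeNumSignBitsForTargetNode reports for a constant width.
    static int32_t bfe_i32(int32_t src, uint32_t offset, uint32_t width) {
      if (width == 0)
        return 0;
      if (offset + width < 32) {
        int32_t shl = (int32_t)((uint32_t)src << (32 - offset - width));
        return shl >> (32 - width);             // arithmetic shift replicates the sign
      }
      return src >> offset;
    }

With offset == 0 the extract degenerates into a plain zero/sign extension of the low `width` bits, which is why the combine above can replace it with SIGN_EXTEND_INREG or getZeroExtendInReg.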
- SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; - - /// \brief Split a vector load into 2 loads of half the vector. - SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; - - /// \brief Split a vector store into a scalar store of each component. - SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; - - /// \brief Split a vector store into 2 stores of half the vector. - SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; - void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, - SmallVectorImpl &Results) const; - bool isHWTrueValue(SDValue Op) const; - bool isHWFalseValue(SDValue Op) const; - - /// The SelectionDAGBuilder will automatically promote function arguments - /// with illegal types. However, this does not work for the AMDGPU targets - /// since the function arguments are stored in memory as these illegal types. - /// In order to handle this properly we need to get the origianl types sizes - /// from the LLVM IR Function and fixup the ISD:InputArg values before - /// passing them to AnalyzeFormalArguments() - void getOriginalFunctionArgs(SelectionDAG &DAG, - const Function *F, - const SmallVectorImpl &Ins, - SmallVectorImpl &OrigIns) const; - void AnalyzeFormalArguments(CCState &State, - const SmallVectorImpl &Ins) const; - -public: - AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); - - bool isFAbsFree(EVT VT) const override; - bool isFNegFree(EVT VT) const override; - bool isTruncateFree(EVT Src, EVT Dest) const override; - bool isTruncateFree(Type *Src, Type *Dest) const override; - - bool isZExtFree(Type *Src, Type *Dest) const override; - bool isZExtFree(EVT Src, EVT Dest) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - - MVT getVectorIdxTy() const override; - bool isSelectSupported(SelectSupportKind) const override; - - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - bool ShouldShrinkFPConstant(EVT VT) const override; - bool shouldReduceLoadWidth(SDNode *Load, - ISD::LoadExtType ExtType, - EVT ExtVT) const override; - - bool isLoadBitCastBeneficial(EVT, EVT) const override; - - bool storeOfVectorConstantIsCheap(EVT MemVT, - unsigned NumElem, - unsigned AS) const override; - bool isCheapToSpeculateCttz() const override; - bool isCheapToSpeculateCtlz() const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const override; - SDValue LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const override; - - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - void ReplaceNodeResults(SDNode * N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const override; - - SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const; - SDValue 
CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; - - const char* getTargetNodeName(unsigned Opcode) const override; - - SDValue getRsqrtEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps, - bool &UseOneConstNR) const override; - SDValue getRecipEstimate(SDValue Operand, - DAGCombinerInfo &DCI, - unsigned &RefinementSteps) const override; - - virtual SDNode *PostISelFolding(MachineSDNode *N, - SelectionDAG &DAG) const { - return N; - } - - /// \brief Determine which of the bits specified in \p Mask are known to be - /// either zero or one and return them in the \p KnownZero and \p KnownOne - /// bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's - /// MachineFunction. - /// - /// \returns a RegisterSDNode representing Reg. - virtual SDValue CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const; -}; - -namespace AMDGPUISD { - -enum NodeType : unsigned { - // AMDIL ISD Opcodes - FIRST_NUMBER = ISD::BUILTIN_OP_END, - CALL, // Function call based on a single integer - UMUL, // 32bit unsigned multiplication - RET_FLAG, - BRANCH_COND, - // End AMDIL ISD Opcodes - DWORDADDR, - FRACT, - CLAMP, - - // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. - // Denormals handled on some parts. - COS_HW, - SIN_HW, - FMAX_LEGACY, - FMIN_LEGACY, - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, - URECIP, - DIV_SCALE, - DIV_FMAS, - DIV_FIXUP, - TRIG_PREOP, // 1 ULP max error for f64 - - // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. - // For f64, max error 2^29 ULP, handles denormals. - RCP, - RSQ, - RSQ_LEGACY, - RSQ_CLAMPED, - LDEXP, - FP_CLASS, - DOT4, - CARRY, - BORROW, - BFE_U32, // Extract range of bits with zero extension to 32-bits. - BFE_I32, // Extract range of bits with sign extension to 32-bits. - BFI, // (src0 & src1) | (~src0 & src2) - BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. - MUL_U24, - MUL_I24, - MAD_U24, - MAD_I24, - TEXTURE_FETCH, - EXPORT, - CONST_ADDRESS, - REGISTER_LOAD, - REGISTER_STORE, - LOAD_INPUT, - SAMPLE, - SAMPLEB, - SAMPLED, - SAMPLEL, - - // These cvt_f32_ubyte* nodes need to remain consecutive and in order. - CVT_F32_UBYTE0, - CVT_F32_UBYTE1, - CVT_F32_UBYTE2, - CVT_F32_UBYTE3, - /// This node is for VLIW targets and it is used to represent a vector - /// that is stored in consecutive registers with the same channel. - /// For example: - /// |X |Y|Z|W| - /// T0|v.x| | | | - /// T1|v.y| | | | - /// T2|v.z| | | | - /// T3|v.w| | | | - BUILD_VERTICAL_VECTOR, - /// Pointer to the start of the shader's constant data. 
- CONST_DATA_PTR, - SENDMSG, - INTERP_MOV, - INTERP_P1, - INTERP_P2, - FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, - STORE_MSKOR, - LOAD_CONSTANT, - TBUFFER_STORE_FORMAT, - LAST_AMDGPU_ISD_NUMBER -}; - - -} // End namespace AMDGPUISD - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp deleted file mode 100644 index 15a3d543a68..00000000000 --- a/lib/Target/R600/AMDGPUInstrInfo.cpp +++ /dev/null @@ -1,369 +0,0 @@ -//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Implementation of the TargetInstrInfo class that is common to all -/// AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUInstrInfo.h" -#include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#define GET_INSTRINFO_NAMED_OPS -#define GET_INSTRMAP_INFO -#include "AMDGPUGenInstrInfo.inc" - -// Pin the vtable to this file. -void AMDGPUInstrInfo::anchor() {} - -AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUGenInstrInfo(-1, -1), ST(st) {} - -const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { - return RI; -} - -bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SubIdx) const { -// TODO: Implement this function - return false; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} - -MachineInstr * -AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { -// TODO: Implement this function - return nullptr; -} - -void -AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("Not Implemented"); -} - -void -AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - 
llvm_unreachable("Not Implemented"); -} - -bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { - MachineBasicBlock *MBB = MI->getParent(); - int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::addr); - // addr is a custom operand with multiple MI operands, and only the - // first MI operand is given a name. - int RegOpIdx = OffsetOpIdx + 1; - int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::chan); - if (isRegisterLoad(*MI)) { - int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::dst); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - getIndirectAddrRegClass()->getRegister(Address)); - } else { - buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - Address, OffsetReg); - } - } else if (isRegisterStore(*MI)) { - int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::val); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), - MI->getOperand(ValOpIdx).getReg()); - } else { - buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), - calculateIndirectAddress(RegIndex, Channel), - OffsetReg); - } - } else { - return false; - } - - MBB->erase(MI); - return true; -} - -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { -// TODO: Implement this function - return nullptr; -} -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { - // TODO: Implement this function - return nullptr; -} -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef Ops) const { - // TODO: Implement this function - return false; -} -bool -AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, - bool UnfoldStore, - SmallVectorImpl &NewMIs) const { - // TODO: Implement this function - return false; -} - -bool -AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl &NewNodes) const { - // TODO: Implement this function - return false; -} - -unsigned -AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex) const { - // TODO: Implement this function - return 0; -} - -bool AMDGPUInstrInfo::enableClusterLoads() const { - return true; -} - -// FIXME: This behaves strangely. If, for example, you have 32 load + stores, -// the first 16 loads will be interleaved with the stores, and the next 16 will -// be clustered as expected. It should really split into 2 16 store batches. -// -// Loads are clustered until this returns false, rather than trying to schedule -// groups of stores. 
This also means we have to deal with saying different -// address space loads should be clustered, and ones which might cause bank -// conflicts. -// -// This might be deprecated so it might not be worth that much effort to fix. -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, - int64_t Offset0, int64_t Offset1, - unsigned NumLoads) const { - assert(Offset1 > Offset0 && - "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 64 - // bytes, then schedule together. - - // A cacheline is 64 bytes (for global memory). - return (NumLoads <= 16 && (Offset1 - Offset0) < 64); -} - -bool -AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) - const { - // TODO: Implement this function - return true; -} -void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - // TODO: Implement this function -} - -bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { - // TODO: Implement this function - return MI->getDesc().isPredicable(); -} - -bool -AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - // TODO: Implement this function - return true; -} - -bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; -} - -bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; -} - -int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int Offset = -1; - - if (MFI->getNumObjects() == 0) { - return -1; - } - - if (MRI.livein_empty()) { - return 0; - } - - const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); - for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), - LE = MRI.livein_end(); - LI != LE; ++LI) { - unsigned Reg = LI->first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) - continue; - - unsigned RegIndex; - unsigned RegEnd; - for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; - ++RegIndex) { - if (IndirectRC->getRegister(RegIndex) == Reg) - break; - } - Offset = std::max(Offset, (int)RegIndex); - } - - return Offset + 1; -} - -int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { - int Offset = 0; - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Variable sized objects are not supported - assert(!MFI->hasVarSizedObjects()); - - if (MFI->getNumObjects() == 0) { - return -1; - } - - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); - - return getIndirectIndexBegin(MF) + Offset; -} - -int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { - switch (Channels) { - default: return Opcode; - case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); - case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); - case 3: return AMDGPU::getMaskedMIMGOp(Opcode, 
AMDGPU::Channels_3); - } -} - -// Wrapper for Tablegen'd function. enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned -// instead. -namespace llvm { -namespace AMDGPU { -static int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); -} -} -} - -// This must be kept in sync with the SISubtarget class in SIInstrInfo.td -enum SISubtarget { - SI = 0, - VI = 1 -}; - -static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { - switch (Gen) { - default: - return SI; - case AMDGPUSubtarget::VOLCANIC_ISLANDS: - return VI; - } -} - -int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode( - Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); - - // -1 means that Opcode is already a native instruction. - if (MCOp == -1) - return Opcode; - - // (uint16_t)-1 means that Opcode is a pseudo instruction that has - // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) - return -1; - - return MCOp; -} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h deleted file mode 100644 index 86d3962b385..00000000000 --- a/lib/Target/R600/AMDGPUInstrInfo.h +++ /dev/null @@ -1,206 +0,0 @@ -//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Contains the definition of a TargetInstrInfo class that is common -/// to all AMD GPUs. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H - -#include "AMDGPURegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" -#include - -#define GET_INSTRINFO_HEADER -#define GET_INSTRINFO_ENUM -#define GET_INSTRINFO_OPERAND_ENUM -#include "AMDGPUGenInstrInfo.inc" - -#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT -#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT -#define OPCODE_IS_ZERO AMDGPU::PRED_SETE -#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE - -namespace llvm { - -class AMDGPUSubtarget; -class MachineFunction; -class MachineInstr; -class MachineInstrBuilder; - -class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { -private: - const AMDGPURegisterInfo RI; - virtual void anchor(); -protected: - const AMDGPUSubtarget &ST; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - - virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const override; - bool hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const override; - unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; - bool hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const; - - MachineInstr * - convertToThreeAddress(MachineFunction::iterator &MFI, - 
MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const override; - - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - -protected: - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override; - -public: - /// \returns the smallest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexBegin(const MachineFunction &MF) const; - - /// \returns the largest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexEnd(const MachineFunction &MF) const; - - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef Ops) const override; - bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl &NewMIs) const override; - bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl &NewNodes) const override; - unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = nullptr) const override; - - bool enableClusterLoads() const override; - - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const override; - - bool - ReverseBranchCondition(SmallVectorImpl &Cond) const override; - void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const override; - bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const override; - bool isPredicable(MachineInstr *MI) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - // Helper functions that check the opcode for status information - bool isRegisterStore(const MachineInstr &MI) const; - bool isRegisterLoad(const MachineInstr &MI) const; - - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. - /// Return -1 if the target-specific opcode for the pseudo instruction does - /// not exist. If Opcode is not a pseudo instruction, this is identity. - int pseudoToMCOpcode(int Opcode) const; - - /// \brief Return the descriptor of the target-specific machine instruction - /// that corresponds to the specified pseudo or native opcode. - const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { - return get(pseudoToMCOpcode(Opcode)); - } - -//===---------------------------------------------------------------------===// -// Pure virtual funtions to be implemented by sub-classes. 
-//===---------------------------------------------------------------------===// - - virtual bool isMov(unsigned opcode) const = 0; - - /// \brief Calculate the "Indirect Address" for the given \p RegIndex and - /// \p Channel - /// - /// We model indirect addressing using a virtual address space that can be - /// accesed with loads and stores. The "Indirect Address" is the memory - /// address in this virtual address space that maps to the given \p RegIndex - /// and \p Channel. - virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const = 0; - - /// \returns The register class to be used for loading and storing values - /// from an "Indirect Address" . - virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; - - /// \brief Build instruction(s) for an indirect register write. - /// - /// \returns The instruction that performs the indirect register write - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build instruction(s) for an indirect register read. - /// - /// \returns The instruction that performs the indirect register read - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build a MOV instruction. - virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const = 0; - - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the - /// equivalent opcode that writes \p Channels Channels. - int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; - -}; - -namespace AMDGPU { - int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); -} // End namespace AMDGPU - -} // End llvm namespace - -#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) -#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) - -#endif diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td deleted file mode 100644 index b413897d9d2..00000000000 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ /dev/null @@ -1,245 +0,0 @@ -//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains DAG node defintions for the AMDGPU target. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// AMDGPU DAG Profiles -//===----------------------------------------------------------------------===// - -def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ - SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> -]>; - -def AMDGPUTrigPreOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - -def AMDGPULdExpOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - -def AMDGPUFPClassOp : SDTypeProfile<1, 2, - [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] ->; - -def AMDGPUDivScaleOp : SDTypeProfile<2, 3, - [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] ->; - -// float, float, float, vcc -def AMDGPUFmasOp : SDTypeProfile<1, 4, - [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] ->; - -//===----------------------------------------------------------------------===// -// AMDGPU DAG Nodes -// - -// This argument to this node is a dword address. -def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; - -def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; -def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; - -// out = a - floor(a) -def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; - -// out = 1.0 / a -def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) -def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) -def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; - -// out = 1.0 / sqrt(a) result clamped to +/- max_float. -def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; - -def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; - -def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; - -// out = max(a, b) a and b are floats, where a nan comparison fails. -// This is not commutative because this gives the second operand: -// x < nan ? x : nan -> nan -// nan < x ? nan : x -> x -def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, - [] ->; - -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; - -// out = max(a, b) a and b are signed ints -def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = max(a, b) a and b are unsigned ints -def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, - [SDNPCommutative, SDNPAssociative] ->; - -// out = min(a, b) a and b are floats, where a nan comparison fails. -def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, - [] ->; - -// FIXME: TableGen doesn't like commutative instructions with more -// than 2 operands. 
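As the FMAX_LEGACY / FMIN_LEGACY comments above spell out, the legacy min/max nodes behave like a compare-and-select in which a NaN operand makes the comparison fail, so the second operand is returned and the nodes cannot be marked commutative. A deliberately loose sketch of that behaviour (not the precise hardware definition):

    // Any NaN operand makes the compare false, so `b` is returned.
    static float fmax_legacy(float a, float b) {
      return (a > b) ? a : b;
    }
    // fmax_legacy(1.0f, NAN) yields NaN, but fmax_legacy(NAN, 1.0f) yields 1.0f,
    // which is the asymmetry described for FMAX_LEGACY above.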
-// out = max(a, b, c) a, b and c are floats -def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b, and c are signed ints -def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b and c are unsigned ints -def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are floats -def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are signed ints -def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b) a and b are unsigned ints -def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 -def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; - -// out = (src1 > src0) ? 1 : 0 -def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>; - - -def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2", - SDTIntToFPOp, []>; -def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", - SDTIntToFPOp, []>; - - -// urecip - This operation is a helper for integer division, it returns the -// result of 1 / a as a fractional unsigned integer. -// out = (2^32 / a) + e -// e is rounding error -def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; - -// Special case divide preop and flags. -def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; - -// Special case divide FMA with scale and flags (src0 = Quotient, -// src1 = Denominator, src2 = Numerator). -def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>; - -// Single or double precision division fixup. -// Special case divide fixup and flags(src0 = Quotient, src1 = -// Denominator, src2 = Numerator). -def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; - -// Look Up 2.0 / pi src0 with segment select src1[4:0] -def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; - -def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", - SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, - [SDNPHasChain, SDNPMayLoad]>; - -def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", - SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, - [SDNPHasChain, SDNPMayStore]>; - -// MSKOR instructions are atomic memory instructions used mainly for storing -// 8-bit and 16-bit values. 
The definition is: -// -// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) -// -// src0: vec4(src, 0, 0, mask) -// src1: dst - rat offset (aka pointer) in dwords -def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", - SDTypeProfile<0, 2, []>, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - -def AMDGPUround : SDNode<"ISD::FROUND", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; - -def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; -def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; -def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; - -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - -// Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when -// performing the mulitply. The result is a 32-bit value. -def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, - [SDNPCommutative] ->; -def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, - [SDNPCommutative] ->; - -def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp, - [] ->; -def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, - [] ->; - -def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", - SDTypeProfile<0, 1, [SDTCisInt<0>]>, - [SDNPHasChain, SDNPInGlue]>; - -def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1", - SDTypeProfile<1, 3, [SDTCisFP<0>]>, - [SDNPInGlue, SDNPOutGlue]>; - -def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2", - SDTypeProfile<1, 4, [SDTCisFP<0>]>, - [SDNPInGlue]>; - -//===----------------------------------------------------------------------===// -// Flow Control Profile Types -//===----------------------------------------------------------------------===// -// Branch instruction where second and third are basic blocks -def SDTIL_BRCond : SDTypeProfile<0, 2, [ - SDTCisVT<0, OtherVT> - ]>; - -//===----------------------------------------------------------------------===// -// Flow Control DAG Nodes -//===----------------------------------------------------------------------===// -def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; - -//===----------------------------------------------------------------------===// -// Call/Return DAG Nodes -//===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td deleted file mode 100644 index 72cab39277c..00000000000 --- a/lib/Target/R600/AMDGPUInstructions.td +++ /dev/null @@ -1,682 +0,0 @@ -//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains instruction defs that are common to all hw codegen -// targets. 
-// -//===----------------------------------------------------------------------===// - -class AMDGPUInst pattern> : Instruction { - field bit isRegisterLoad = 0; - field bit isRegisterStore = 0; - - let Namespace = "AMDGPU"; - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asm; - let Pattern = pattern; - let Itinerary = NullALU; - - let TSFlags{63} = isRegisterLoad; - let TSFlags{62} = isRegisterStore; -} - -class AMDGPUShaderInst pattern> - : AMDGPUInst { - - field bits<32> Inst = 0xffffffff; - -} - -def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; -def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; - -def InstFlag : OperandWithDefaultOps ; -def ADDRIndirect : ComplexPattern; - -let OperandType = "OPERAND_IMMEDIATE" in { - -def u32imm : Operand { - let PrintMethod = "printU32ImmOperand"; -} - -def u16imm : Operand { - let PrintMethod = "printU16ImmOperand"; -} - -def u8imm : Operand { - let PrintMethod = "printU8ImmOperand"; -} - -} // End OperandType = "OPERAND_IMMEDIATE" - -//===--------------------------------------------------------------------===// -// Custom Operands -//===--------------------------------------------------------------------===// -def brtarget : Operand; - -//===----------------------------------------------------------------------===// -// PatLeafs for floating-point comparisons -//===----------------------------------------------------------------------===// - -def COND_OEQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}] ->; - -def COND_ONE : PatLeaf < - (cond), - [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}] ->; - -def COND_OGT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}] ->; - -def COND_OGE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}] ->; - -def COND_OLT : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}] ->; - -def COND_OLE : PatLeaf < - (cond), - [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] ->; - - -def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; -def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; - -//===----------------------------------------------------------------------===// -// PatLeafs for unsigned / unordered comparisons -//===----------------------------------------------------------------------===// - -def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>; -def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>; -def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>; -def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>; -def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>; -def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>; - -// XXX - For some reason R600 version is preferring to use unordered -// for setne? 
-def COND_UNE_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}] ->; - -//===----------------------------------------------------------------------===// -// PatLeafs for signed comparisons -//===----------------------------------------------------------------------===// - -def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>; -def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>; -def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>; -def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>; - -//===----------------------------------------------------------------------===// -// PatLeafs for integer equality -//===----------------------------------------------------------------------===// - -def COND_EQ : PatLeaf < - (cond), - [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}] ->; - -def COND_NE : PatLeaf < - (cond), - [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}] ->; - -def COND_NULL : PatLeaf < - (cond), - [{(void)N; return false;}] ->; - -//===----------------------------------------------------------------------===// -// Load/Store Pattern Fragments -//===----------------------------------------------------------------------===// - -class PrivateMemOp : PatFrag (N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; -}]>; - -class PrivateLoad : PrivateMemOp < - (ops node:$ptr), (op node:$ptr) ->; - -class PrivateStore : PrivateMemOp < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) ->; - -def load_private : PrivateLoad ; - -def truncstorei8_private : PrivateStore ; -def truncstorei16_private : PrivateStore ; -def store_private : PrivateStore ; - -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -// Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -// Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -class AZExtLoadBase : PatFrag<(ops node:$ptr), - (ld_node node:$ptr), [{ - LoadSDNode *L = cast(N); - return L->getExtensionType() == ISD::ZEXTLOAD || - L->getExtensionType() == ISD::EXTLOAD; -}]>; - -def az_extload : AZExtLoadBase ; - -def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def extloadi8_private : PrivateLoad ; -def sextloadi8_private : 
PrivateLoad ; - -def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -def extloadi16_private : PrivateLoad ; -def sextloadi16_private : PrivateLoad ; - -def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def az_extloadi32_global : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isGlobalLoad(dyn_cast(N)); -}]>; - -def az_extloadi32_flat : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def az_extloadi32_constant : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isConstantLoad(dyn_cast(N), -1); -}]>; - -def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast(N)); -}]>; - -def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def local_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast(N)); -}]>; - -def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isLocalLoad(dyn_cast(N)); -}]>; - -class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; -}]>; - -def local_load_aligned8bytes : Aligned8Bytes < - (ops node:$ptr), (local_load node:$ptr) ->; - -def local_store_aligned8bytes : Aligned8Bytes < - (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) ->; - -class local_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - - -def atomic_swap_local : local_binary_atomic_op; -def atomic_load_add_local : local_binary_atomic_op; -def atomic_load_sub_local : local_binary_atomic_op; -def atomic_load_and_local : 
local_binary_atomic_op; -def atomic_load_or_local : local_binary_atomic_op; -def atomic_load_xor_local : local_binary_atomic_op; -def atomic_load_nand_local : local_binary_atomic_op; -def atomic_load_min_local : local_binary_atomic_op; -def atomic_load_max_local : local_binary_atomic_op; -def atomic_load_umin_local : local_binary_atomic_op; -def atomic_load_umax_local : local_binary_atomic_op; - -def mskor_global : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - -multiclass AtomicCmpSwapLocal { - - def _32_local : PatFrag < - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; - }]>; - - def _64_local : PatFrag< - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast(N); - return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; - }]>; -} - -defm atomic_cmp_swap : AtomicCmpSwapLocal ; - -def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isFlatLoad(dyn_cast(N)); -}]>; - -def flat_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast(N)); -}]>; - -def mskor_flat : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; -}]>; - -class global_binary_atomic_op : PatFrag< - (ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] ->; - -def atomic_swap_global : global_binary_atomic_op; -def atomic_add_global : global_binary_atomic_op; -def atomic_and_global : global_binary_atomic_op; -def atomic_max_global : global_binary_atomic_op; -def atomic_min_global : global_binary_atomic_op; -def atomic_or_global : global_binary_atomic_op; -def atomic_sub_global : global_binary_atomic_op; -def atomic_umax_global : global_binary_atomic_op; -def atomic_umin_global : global_binary_atomic_op; -def atomic_xor_global : global_binary_atomic_op; - -//===----------------------------------------------------------------------===// -// Misc Pattern Fragments -//===----------------------------------------------------------------------===// - -class Constants { -int TWO_PI = 0x40c90fdb; -int PI = 0x40490fdb; -int TWO_PI_INV = 0x3e22f983; -int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding -int FP32_NEG_ONE = 0xbf800000; -int FP32_ONE = 0x3f800000; -} -def CONST : Constants; - -def FP_ZERO : PatLeaf < - (fpimm), - [{return N->getValueAPF().isZero();}] ->; - -def FP_ONE : PatLeaf < - (fpimm), - [{return N->isExactlyValue(1.0);}] ->; - -def FP_HALF : PatLeaf < - (fpimm), - [{return N->isExactlyValue(0.5);}] ->; - -let isCodeGenOnly = 1, isPseudo = 1 in { - -let usesCustomInserter = 1 in { - -class CLAMP : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] ->; - -class FABS : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FABS $dst, $src0", - [(set f32:$dst, (fabs f32:$src0))] ->; - -class FNEG : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FNEG $dst, $src0", - [(set f32:$dst, (fneg f32:$src0))] ->; - -} // usesCustomInserter = 1 - -multiclass RegisterLoadStore { 
-let UseNamedOperandTable = 1 in { - - def RegisterLoad : AMDGPUShaderInst < - (outs dstClass:$dst), - (ins addrClass:$addr, i32imm:$chan), - "RegisterLoad $dst, $addr", - [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] - > { - let isRegisterLoad = 1; - } - - def RegisterStore : AMDGPUShaderInst < - (outs), - (ins dstClass:$val, addrClass:$addr, i32imm:$chan), - "RegisterStore $val, $addr", - [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] - > { - let isRegisterStore = 1; - } -} -} - -} // End isCodeGenOnly = 1, isPseudo = 1 - -/* Generic helper patterns for intrinsics */ -/* -------------------------------------- */ - -class POW_Common - : Pat < - (fpow f32:$src0, f32:$src1), - (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) ->; - -/* Other helper patterns */ -/* --------------------- */ - -/* Extract element pattern */ -class Extract_Element - : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), - (EXTRACT_SUBREG $src, sub_reg) ->; - -/* Insert element pattern */ -class Insert_Element - : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), - (INSERT_SUBREG $vec, $elem, sub_reg) ->; - -// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer -// can handle COPY instructions. -// bitconvert pattern -class BitConvert : Pat < - (dt (bitconvert (st rc:$src0))), - (dt rc:$src0) ->; - -// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer -// can handle COPY instructions. -class DwordAddrPat : Pat < - (vt (AMDGPUdwordaddr (vt rc:$addr))), - (vt rc:$addr) ->; - -// BFI_INT patterns - -multiclass BFIPatterns { - // Definition from ISA doc: - // (y & x) | (z & ~x) - def : Pat < - (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (BFI_INT $x, $y, $z) - >; - - // SHA-256 Ch function - // z ^ (x & (y ^ z)) - def : Pat < - (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (BFI_INT $x, $y, $z) - >; - - def : Pat < - (fcopysign f32:$src0, f32:$src1), - (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) - >; - - def : Pat < - (f64 (fcopysign f64:$src0, f64:$src1)), - (REG_SEQUENCE RC64, - (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), - (i32 (EXTRACT_SUBREG $src0, sub1)), - (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) - >; -} - -// SHA-256 Ma patterns - -// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y -class SHA256MaPattern : Pat < - (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) ->; - -// Bitfield extract patterns - -def IMMZeroBasedBitfieldMask : PatLeaf <(imm), [{ - return isMask_32(N->getZExtValue()); -}]>; - -def IMMPopCount : SDNodeXFormgetTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), - MVT::i32); -}]>; - -class BFEPattern : Pat < - (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), - (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) ->; - -// rotr pattern -class ROTRPattern : Pat < - (rotr i32:$src0, i32:$src1), - (BIT_ALIGN $src0, $src0, $src1) ->; - -// 24-bit arithmetic patterns -def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; - -// Special conversion patterns - -def cvt_rpi_i32_f32 : PatFrag < - (ops node:$src), - (fp_to_sint (ffloor (fadd $src, FP_HALF))), - [{ (void) N; return TM.Options.NoNaNsFPMath; }] ->; - -def cvt_flr_i32_f32 : PatFrag < - (ops node:$src), - (fp_to_sint (ffloor $src)), - [{ (void)N; return TM.Options.NoNaNsFPMath; }] ->; - -/* -class UMUL24Pattern : Pat < - (mul U24:$x, U24:$y), - (UMUL24 $x, $y) 
->; -*/ - -class IMad24Pat : Pat < - (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), - (Inst $src0, $src1, $src2) ->; - -class UMad24Pat : Pat < - (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), - (Inst $src0, $src1, $src2) ->; - -multiclass Expand24IBitOps { - def _expand_imad24 : Pat < - (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_imul24 : Pat < - (AMDGPUmul_i24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - -multiclass Expand24UBitOps { - def _expand_umad24 : Pat < - (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_umul24 : Pat < - (AMDGPUmul_u24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - -class RcpPat : Pat < - (fdiv FP_ONE, vt:$src), - (RcpInst $src) ->; - -class RsqPat : Pat < - (AMDGPUrcp (fsqrt vt:$src)), - (RsqInst $src) ->; - -include "R600Instructions.td" -include "R700Instructions.td" -include "EvergreenInstructions.td" -include "CaymanInstructions.td" - -include "SIInstrInfo.td" - diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp deleted file mode 100644 index e94bb6013d8..00000000000 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===- AMDGPUIntrinsicInfo.cpp - AMDGPU Intrinsic Information ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU Implementation of the IntrinsicInfo class. -// -//===-----------------------------------------------------------------------===// - -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" - -using namespace llvm; - -#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -#include "AMDGPUGenIntrinsics.inc" -#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN - -AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() - : TargetIntrinsicInfo() {} - -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - static const char *const names[] = { -#define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_NAME_TABLE - }; - - if (IntrID < Intrinsic::num_intrinsics) { - return nullptr; - } - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && - "Invalid intrinsic ID"); - - std::string Result(names[IntrID - Intrinsic::num_intrinsics]); - return Result; -} - -unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, - unsigned Len) const { - if (!StringRef(Name, Len).startswith("llvm.")) - return 0; // All intrinsics start with 'llvm.' 
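[Editor's note] The BFIPatterns, SHA256MaPattern, and BFEPattern classes defined earlier in AMDGPUInstrInfo.td are pure bit-manipulation identities, so they can be sanity-checked outside the backend. Below is only an illustrative, self-contained C++ check of those identities; the helpers `bfi` and `bfe` are stand-ins for the matched instructions, not real APIs, and `std::popcount` needs C++20.

.. code-block:: c++

  #include <bit>
  #include <cassert>
  #include <cstdint>

  // Models BFI_INT as stated in the ISA-doc comment above:
  // bits of y where x is set, bits of z where x is clear.
  static uint32_t bfi(uint32_t x, uint32_t y, uint32_t z) {
    return (y & x) | (z & ~x);
  }

  // Models a bitfield extract: the low 'width' bits of (src >> offset).
  static uint32_t bfe(uint32_t src, uint32_t offset, uint32_t width) {
    return (src >> offset) & (width < 32 ? (1u << width) - 1u : ~0u);
  }

  int main() {
    const uint32_t x = 0xF0F0F0F0u, y = 0x12345678u, z = 0x9ABCDEF0u;

    // SHA-256 Ch(x, y, z) = z ^ (x & (y ^ z)) is exactly a bitfield insert.
    assert((z ^ (x & (y ^ z))) == bfi(x, y, z));

    // SHA-256 Ma(x, y, z): (x & z) | (y & (x | z)) == BFI(x ^ y, z, y).
    assert(((x & z) | (y & (x | z))) == bfi(x ^ y, z, y));

    // fcopysign via BFI with a sign-stripping mask: magnitude from a, sign from b.
    const uint32_t a = 0xC0490FDBu /* -pi as f32 bits */, b = 0x3F800000u /* +1.0f */;
    assert(bfi(0x7FFFFFFFu, a, b) == 0x40490FDBu); // +pi

    // BFEPattern: (src >> rshift) & mask, with mask a contiguous low-bit mask,
    // equals a bitfield extract of popcount(mask) bits.
    const uint32_t src = 0xDEADBEEFu, rshift = 8, mask = 0xFFFu;
    assert(((src >> rshift) & mask) == bfe(src, rshift, std::popcount(mask)));
    return 0;
  }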
- -#define GET_FUNCTION_RECOGNIZER -#include "AMDGPUGenIntrinsics.inc" -#undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID = - (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; - IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); - - if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { - return IntrinsicID; - } - return 0; -} - -bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { -// Overload Table -#define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_OVERLOAD_TABLE -} - -Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, - Type **Tys, - unsigned numTys) const { - llvm_unreachable("Not implemented"); -} diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h deleted file mode 100644 index 4c95b5ec097..00000000000 --- a/lib/Target/R600/AMDGPUIntrinsicInfo.h +++ /dev/null @@ -1,48 +0,0 @@ -//===- AMDGPUIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. -// -//===-----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H - -#include "llvm/IR/Intrinsics.h" -#include "llvm/Target/TargetIntrinsicInfo.h" - -namespace llvm { -class TargetMachine; - -namespace AMDGPUIntrinsic { -enum ID { - last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, -#define GET_INTRINSIC_ENUM_VALUES -#include "AMDGPUGenIntrinsics.inc" -#undef GET_INTRINSIC_ENUM_VALUES - , num_AMDGPU_intrinsics -}; - -} // end namespace AMDGPUIntrinsic - -class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { -public: - AMDGPUIntrinsicInfo(); - std::string getName(unsigned IntrId, Type **Tys = nullptr, - unsigned numTys = 0) const override; - unsigned lookupName(const char *Name, unsigned Len) const override; - bool isOverloaded(unsigned IID) const override; - Function *getDeclaration(Module *M, unsigned ID, - Type **Tys = nullptr, - unsigned numTys = 0) const override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td deleted file mode 100644 index ab489cd2a4a..00000000000 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ /dev/null @@ -1,90 +0,0 @@ -//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines intrinsics that are used by all hw codegen targets. 
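[Editor's note] The `enum ID` in AMDGPUIntrinsicInfo.h above starts the target-specific intrinsic IDs immediately after the core LLVM intrinsics (`last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1`), which is why getName() above indexes its table with `IntrID - Intrinsic::num_intrinsics`. A toy, standalone sketch of that numbering scheme; every name and constant here is made up for illustration.

.. code-block:: c++

  #include <cassert>
  #include <string>

  // Stand-in for Intrinsic::num_intrinsics in core LLVM (hypothetical value).
  constexpr unsigned NumCoreIntrinsics = 1000;

  // Target intrinsic IDs start right after the core ones, mirroring the enum above.
  enum TargetIntrinsicID : unsigned {
    last_non_target_intrinsic = NumCoreIntrinsics - 1,
    target_intrinsic_a,   // == NumCoreIntrinsics
    target_intrinsic_b,   // == NumCoreIntrinsics + 1
    num_target_intrinsics
  };

  // The name table holds only the target intrinsics, so it is indexed by the
  // offset from NumCoreIntrinsics, like names[IntrID - Intrinsic::num_intrinsics].
  static const char *const Names[] = {"llvm.target.a", "llvm.target.b"};

  std::string getName(unsigned IntrID) {
    assert(IntrID >= NumCoreIntrinsics && IntrID < num_target_intrinsics &&
           "not a target intrinsic");
    return Names[IntrID - NumCoreIntrinsics];
  }

  int main() {
    assert(getName(target_intrinsic_b) == "llvm.target.b");
    return 0;
  }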
-// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "AMDGPU", isTarget = 1 in { - - def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; - def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - - // This is named backwards (instead of rsq_legacy) so we don't have - // to define it with the public builtins intrinsics. This is a - // workaround for how intrinsic names are parsed. If the name is - // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant - // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. - def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; - def int_AMDGPU_kilp : Intrinsic<[], [], []>; - def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imax 
: Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; -} - -// Legacy names for compatibility. -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -} - -let TargetPrefix = "TGSI", isTarget = 1 in { - - def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; -} - -include "SIIntrinsics.td" diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp deleted file mode 100644 index 20831460b93..00000000000 --- a/lib/Target/R600/AMDGPUMCInstLower.cpp +++ /dev/null @@ -1,154 +0,0 @@ -//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
-// -//===----------------------------------------------------------------------===// -// - -#include "AMDGPUMCInstLower.h" -#include "AMDGPUAsmPrinter.h" -#include "AMDGPUTargetMachine.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "R600InstrInfo.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include - -using namespace llvm; - -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): - Ctx(ctx), ST(st) -{ } - -void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { - - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode()); - - if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); - C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " - "a target-specific version: " + Twine(MI->getOpcode())); - } - - OutMI.setOpcode(MCOpcode); - - for (const MachineOperand &MO : MI->explicit_operands()) { - MCOperand MCOp; - switch (MO.getType()) { - default: - llvm_unreachable("unknown operand type"); - case MachineOperand::MO_Immediate: - MCOp = MCOperand::createImm(MO.getImm()); - break; - case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( - MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: { - const GlobalValue *GV = MO.getGlobal(); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); - break; - } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - case MachineOperand::MO_ExternalSymbol: { - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } - } - OutMI.addOperand(MCOp); - } -} - -void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { - const AMDGPUSubtarget &STI = MF->getSubtarget(); - AMDGPUMCInstLower MCInstLowering(OutContext, STI); - -#ifdef _DEBUG - StringRef Err; - if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { - errs() << "Warning: Illegal instruction detected: " << Err << "\n"; - MI->dump(); - } -#endif - if (MI->isBundle()) { - const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); - ++I; - } - } else { - MCInst TmpInst; - MCInstLowering.lower(MI, TmpInst); - EmitToStreamer(*OutStreamer, TmpInst); - - if (STI.dumpCode()) { - // Disassemble instruction/operands to text. 
- DisasmLines.resize(DisasmLines.size() + 1); - std::string &DisasmLine = DisasmLines.back(); - raw_string_ostream DisasmStream(DisasmLine); - - AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *MF->getSubtarget().getInstrInfo(), - *MF->getSubtarget().getRegisterInfo()); - InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), - MF->getSubtarget()); - - // Disassemble instruction/operands to hex representation. - SmallVector Fixups; - SmallVector CodeBytes; - raw_svector_ostream CodeStream(CodeBytes); - - auto &ObjStreamer = static_cast(*OutStreamer); - MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); - InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, - MF->getSubtarget()); - CodeStream.flush(); - - HexLines.resize(HexLines.size() + 1); - std::string &HexLine = HexLines.back(); - raw_string_ostream HexStream(HexLine); - - for (size_t i = 0; i < CodeBytes.size(); i += 4) { - unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i]; - HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord); - } - - DisasmStream.flush(); - DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size()); - } - } -} diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h deleted file mode 100644 index d322fe072b2..00000000000 --- a/lib/Target/R600/AMDGPUMCInstLower.h +++ /dev/null @@ -1,35 +0,0 @@ -//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H - -namespace llvm { - -class AMDGPUSubtarget; -class MachineInstr; -class MCContext; -class MCInst; - -class AMDGPUMCInstLower { - MCContext &Ctx; - const AMDGPUSubtarget &ST; - -public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST); - - /// \brief Lower a MachineInstr to an MCInst - void lower(const MachineInstr *MI, MCInst &OutMI) const; - -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp deleted file mode 100644 index 21c7da66323..00000000000 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "AMDGPUMachineFunction.h" -#include "AMDGPU.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Function.h" -using namespace llvm; - -static const char *const ShaderTypeAttribute = "ShaderType"; - -// Pin the vtable to this file. 
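[Editor's note] The dumpCode path in EmitInstruction above groups the encoded instruction bytes into 32-bit words and prints them as zero-padded hex. A minimal standalone sketch of that formatting step (hypothetical helper; it assumes a little-endian host and a byte count that is a multiple of four, which is what the loop above effectively relies on).

.. code-block:: c++

  #include <cstdint>
  #include <cstdio>
  #include <cstring>
  #include <vector>

  // Print an encoded instruction buffer as space-separated 32-bit hex words,
  // mirroring the "%s%08X" formatting used by the dump loop above.
  void printHexDwords(const std::vector<uint8_t> &CodeBytes) {
    for (size_t i = 0; i + 4 <= CodeBytes.size(); i += 4) {
      uint32_t DWord;
      std::memcpy(&DWord, &CodeBytes[i], 4); // avoids the unaligned cast in the original
      std::printf("%s%08X", i > 0 ? " " : "", static_cast<unsigned>(DWord));
    }
    std::printf("\n");
  }

  int main() {
    // Hypothetical 8-byte encoding, just to exercise the formatting.
    printHexDwords({0x01, 0x02, 0x03, 0x04, 0xAA, 0xBB, 0xCC, 0xDD});
    return 0;
  }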
-void AMDGPUMachineFunction::anchor() {} - -AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo(), - ShaderType(ShaderType::COMPUTE), - LDSSize(0), - ScratchSize(0), - IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } -} diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h deleted file mode 100644 index f5e4694e76f..00000000000 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ /dev/null @@ -1,45 +0,0 @@ -//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H -#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H - -#include "llvm/CodeGen/MachineFunction.h" -#include - -namespace llvm { - -class AMDGPUMachineFunction : public MachineFunctionInfo { - virtual void anchor(); - unsigned ShaderType; - -public: - AMDGPUMachineFunction(const MachineFunction &MF); - /// A map to keep track of local memory objects and their offsets within - /// the local memory space. - std::map LocalMemoryObjects; - /// Number of bytes in the LDS that are being used. - unsigned LDSSize; - - /// Start of implicit kernel args - unsigned ABIArgOffset; - - unsigned getShaderType() const { - return ShaderType; - } - - unsigned ScratchSize; - bool IsKernel; -}; - -} -#endif diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp deleted file mode 100644 index 4a65bfc57f1..00000000000 --- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp +++ /dev/null @@ -1,407 +0,0 @@ -//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass eliminates allocas by either converting them into vectors or -// by migrating them to local address space. 
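[Editor's note] AMDGPUMachineFunction's constructor above reads the shader type from a string-valued "ShaderType" function attribute, defaulting to compute when the attribute is absent; note that StringRef::getAsInteger() returns true on *failure*, so the llvm_unreachable only fires for an attribute that is present but not numeric. A rough standalone model of that logic, using a plain C string in place of the LLVM attribute API (the enumerator values are illustrative).

.. code-block:: c++

  #include <cstdlib>
  #include <stdexcept>

  enum ShaderType { PIXEL = 0, VERTEX = 1, GEOMETRY = 2, COMPUTE = 3 }; // illustrative values

  // Models the constructor above: no attribute -> COMPUTE; a present but
  // non-numeric attribute -> hard error (the real code hits llvm_unreachable).
  unsigned shaderTypeFromAttribute(const char *Attr /* nullptr if absent */) {
    unsigned Type = COMPUTE;                         // default, as in the constructor above
    if (Attr) {
      char *End = nullptr;
      unsigned long V = std::strtoul(Attr, &End, 0); // radix 0, like getAsInteger(0, ...)
      if (End == Attr || *End != '\0')
        throw std::logic_error("Can't parse shader type!");
      Type = static_cast<unsigned>(V);
    }
    return Type;
  }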
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -#define DEBUG_TYPE "amdgpu-promote-alloca" - -using namespace llvm; - -namespace { - -class AMDGPUPromoteAlloca : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - const AMDGPUSubtarget &ST; - int LocalMemAvailable; - -public: - AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), - LocalMemAvailable(0) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - const char *getPassName() const override { return "AMDGPU Promote Alloca"; } - void visitAlloca(AllocaInst &I); -}; - -} // End anonymous namespace - -char AMDGPUPromoteAlloca::ID = 0; - -bool AMDGPUPromoteAlloca::doInitialization(Module &M) { - Mod = &M; - return false; -} - -bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - - const FunctionType *FTy = F.getFunctionType(); - - LocalMemAvailable = ST.getLocalMemorySize(); - - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); - if (ParamTy->isPointerTy() && - ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LocalMemAvailable = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - break; - } - } - - if (LocalMemAvailable > 0) { - // Check how much local memory is being used by global objects - for (Module::global_iterator I = Mod->global_begin(), - E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; - PointerType *GVTy = GV->getType(); - if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) - continue; - for (Value::use_iterator U = GV->use_begin(), - UE = GV->use_end(); U != UE; ++U) { - Instruction *Use = dyn_cast(*U); - if (!Use) - continue; - if (Use->getParent()->getParent() == &F) - LocalMemAvailable -= - Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); - } - } - } - - LocalMemAvailable = std::max(0, LocalMemAvailable); - DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); - - visit(F); - - return false; -} - -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { - return VectorType::get(ArrayTy->getArrayElementType(), - ArrayTy->getArrayNumElements()); -} - -static Value * -calculateVectorIndex(Value *Ptr, - const std::map &GEPIdx) { - if (isa(Ptr)) - return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); - - GetElementPtrInst *GEP = cast(Ptr); - - auto I = GEPIdx.find(GEP); - return I == GEPIdx.end() ? nullptr : I->second; -} - -static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { - // FIXME we only support simple cases - if (GEP->getNumOperands() != 3) - return NULL; - - ConstantInt *I0 = dyn_cast(GEP->getOperand(1)); - if (!I0 || !I0->isZero()) - return NULL; - - return GEP->getOperand(2); -} - -// Not an instruction handled below to turn into a vector. -// -// TODO: Check isTriviallyVectorizable for calls and handle other -// instructions. 
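[Editor's note] runOnFunction above computes a local-memory budget before looking at any alloca: it starts from the subtarget's LDS size, zeroes the budget if any kernel argument is already a local-address-space pointer (such an argument may legitimately claim all of LDS), and subtracts the LDS globals this function uses. A compact, hypothetical model of that accounting; the in-tree pass walks each global's use list, while this sketch just flags whether a global is used by the function.

.. code-block:: c++

  #include <algorithm>
  #include <vector>

  struct LDSGlobal {
    unsigned SizeInBytes;
    bool UsedByThisFunction;
  };

  // Mirrors the budget logic in runOnFunction above (sizes in bytes).
  int computeLocalMemAvailable(int LocalMemorySize,          // ST.getLocalMemorySize()
                               bool HasLocalPointerArgument, // any param in LOCAL_ADDRESS?
                               const std::vector<LDSGlobal> &Globals) {
    int Available = LocalMemorySize;
    if (HasLocalPointerArgument)
      Available = 0; // the argument may already cover all of local memory
    if (Available > 0)
      for (const LDSGlobal &G : Globals)
        if (G.UsedByThisFunction)
          Available -= static_cast<int>(G.SizeInBytes);
    return std::max(0, Available); // clamped, as in the pass above
  }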
-static bool canVectorizeInst(Instruction *Inst) { - switch (Inst->getOpcode()) { - case Instruction::Load: - case Instruction::Store: - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - return true; - default: - return false; - } -} - -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { - Type *AllocaTy = Alloca->getAllocatedType(); - - DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); - - // FIXME: There is no reason why we can't support larger arrays, we - // are just being conservative for now. - if (!AllocaTy->isArrayTy() || - AllocaTy->getArrayElementType()->isVectorTy() || - AllocaTy->getArrayNumElements() > 4) { - - DEBUG(dbgs() << " Cannot convert type to vector"); - return false; - } - - std::map GEPVectorIdx; - std::vector WorkList; - for (User *AllocaUser : Alloca->users()) { - GetElementPtrInst *GEP = dyn_cast(AllocaUser); - if (!GEP) { - if (!canVectorizeInst(cast(AllocaUser))) - return false; - - WorkList.push_back(AllocaUser); - continue; - } - - Value *Index = GEPToVectorIndex(GEP); - - // If we can't compute a vector index from this GEP, then we can't - // promote this alloca to vector. - if (!Index) { - DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); - return false; - } - - GEPVectorIdx[GEP] = Index; - for (User *GEPUser : AllocaUser->users()) { - if (!canVectorizeInst(cast(GEPUser))) - return false; - - WorkList.push_back(GEPUser); - } - } - - VectorType *VectorTy = arrayTypeToVecType(AllocaTy); - - DEBUG(dbgs() << " Converting alloca to vector " - << *AllocaTy << " -> " << *VectorTy << '\n'); - - for (std::vector::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - Instruction *Inst = cast(*I); - IRBuilder<> Builder(Inst); - switch (Inst->getOpcode()) { - case Instruction::Load: { - Value *Ptr = Inst->getOperand(0); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); - Value *VecValue = Builder.CreateLoad(BitCast); - Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); - Inst->replaceAllUsesWith(ExtractElement); - Inst->eraseFromParent(); - break; - } - case Instruction::Store: { - Value *Ptr = Inst->getOperand(1); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); - Value *VecValue = Builder.CreateLoad(BitCast); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, - Inst->getOperand(0), - Index); - Builder.CreateStore(NewVecValue, BitCast); - Inst->eraseFromParent(); - break; - } - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - break; - - default: - Inst->dump(); - llvm_unreachable("Inconsistency in instructions promotable to vector"); - } - } - return true; -} - -static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { - bool Success = true; - for (User *User : Val->users()) { - if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) - continue; - if (isa(User)) { - WorkList.push_back(User); - continue; - } - - // FIXME: Correctly handle ptrtoint instructions. 
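[Editor's note] tryPromoteAllocaToVector above turns a small private array into whole-vector accesses: a load of arr[i] becomes a load of the entire vector plus an extractelement, and a store becomes a load of the entire vector, an insertelement, and a store back. Written as ordinary C++ over a stand-in 4-element type, the effect looks roughly like this (illustrative only; the real rewrite operates on LLVM IR through a bitcast of the alloca).

.. code-block:: c++

  #include <array>
  #include <cstddef>

  using Vec4 = std::array<float, 4>; // stand-in for the <4 x float> the pass creates

  // Before: a 4-element private array accessed through dynamic indices.
  float before(std::size_t i, float v) {
    float arr[4] = {};
    arr[i] = v;      // store via a GEP (&arr[i])
    return arr[i];   // load via the same GEP
  }

  // After: the array is viewed as one vector; stores become a whole-vector read,
  // an element insert, and a whole-vector write; loads become a whole-vector
  // read plus an element extract.
  float after(std::size_t i, float v) {
    Vec4 vec = {};     // the alloca, reinterpreted as a single vector
    Vec4 tmp = vec;    // load whole vector
    tmp[i] = v;        // insertelement
    vec = tmp;         // store whole vector
    Vec4 tmp2 = vec;   // load whole vector
    return tmp2[i];    // extractelement
  }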
- Instruction *UseInst = dyn_cast(User); - if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) - return false; - - if (!User->getType()->isPointerTy()) - continue; - - WorkList.push_back(User); - - Success &= collectUsesWithPtrTypes(User, WorkList); - } - return Success; -} - -void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { - IRBuilder<> Builder(&I); - - // First try to replace the alloca with a vector - Type *AllocaTy = I.getAllocatedType(); - - DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - if (tryPromoteAllocaToVector(&I)) - return; - - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - - // FIXME: This is the maximum work group size. We should try to get - // value from the reqd_work_group_size function attribute if it is - // available. - unsigned WorkGroupSize = 256; - int AllocaSize = - WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); - - if (AllocaSize > LocalMemAvailable) { - DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); - return; - } - - std::vector WorkList; - - if (!collectUsesWithPtrTypes(&I, WorkList)) { - DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; - } - - DEBUG(dbgs() << "Promoting alloca to local memory\n"); - LocalMemAvailable -= AllocaSize; - - Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); - GlobalVariable *GV = new GlobalVariable( - *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, - GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); - - FunctionType *FTy = FunctionType::get( - Type::getInt32Ty(Mod->getContext()), false); - AttributeSet AttrSet; - AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); - - Value *ReadLocalSizeY = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.y", FTy, AttrSet); - Value *ReadLocalSizeZ = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.z", FTy, AttrSet); - Value *ReadTIDIGX = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.x", FTy, AttrSet); - Value *ReadTIDIGY = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.y", FTy, AttrSet); - Value *ReadTIDIGZ = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.z", FTy, AttrSet); - - Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); - Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); - Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); - Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); - Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); - - Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); - Tmp0 = Builder.CreateMul(Tmp0, TIdX); - Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); - Value *TID = Builder.CreateAdd(Tmp0, Tmp1); - TID = Builder.CreateAdd(TID, TIdZ); - - std::vector Indices; - Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); - Indices.push_back(TID); - - Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); - - for (std::vector::iterator i = WorkList.begin(), - e = WorkList.end(); i != e; ++i) { - Value *V = *i; - CallInst *Call = dyn_cast(V); - if (!Call) { - Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); - - // The operand's value should be corrected on its own. - if (isa(V)) - continue; - - // FIXME: It doesn't really make sense to try to do this for all - // instructions. 
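[Editor's note] When an alloca is moved to local memory instead, visitAlloca above creates a 256-entry LDS array (one copy of the object per work-item) and indexes it with a flattened work-item ID built from the llvm.r600.read.local.size.* and llvm.r600.read.tidig.* calls it inserts. The flattening is an ordinary row-major linearisation; here it is as a plain, hypothetical C++ helper (in the pass these values come from intrinsic calls).

.. code-block:: c++

  #include <cstdint>

  // Mirrors the index computation emitted by visitAlloca above:
  //   TID = tid.x * (local_size.y * local_size.z) + tid.y * local_size.z + tid.z
  uint32_t flatWorkItemId(uint32_t TidX, uint32_t TidY, uint32_t TidZ,
                          uint32_t CntY, uint32_t CntZ) {
    return TidX * (CntY * CntZ) + TidY * CntZ + TidZ;
  }

  // Sizing check done before the promotion: one copy of the alloca per work-item,
  // with the maximum work-group size hard-coded to 256 (see the FIXME above).
  bool fitsInLocalMemory(unsigned AllocaTypeSize, int LocalMemAvailable) {
    const unsigned WorkGroupSize = 256;
    int AllocaSize = static_cast<int>(WorkGroupSize * AllocaTypeSize);
    return AllocaSize <= LocalMemAvailable;
  }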
- V->mutateType(NewTy); - continue; - } - - IntrinsicInst *Intr = dyn_cast(Call); - if (!Intr) { - std::vector ArgTypes; - for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); - ArgIdx != ArgEnd; ++ArgIdx) { - ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); - } - Function *F = Call->getCalledFunction(); - FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, - F->isVarArg()); - Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(), - NewType, F->getAttributes()); - Function *NewF = cast(C); - Call->setCalledFunction(NewF); - continue; - } - - Builder.SetInsertPoint(Intr); - switch (Intr->getIntrinsicID()) { - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - // These intrinsics are for address space 0 only - Intr->eraseFromParent(); - continue; - case Intrinsic::memcpy: { - MemCpyInst *MemCpy = cast(Intr); - Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), - MemCpy->getLength(), MemCpy->getAlignment(), - MemCpy->isVolatile()); - Intr->eraseFromParent(); - continue; - } - case Intrinsic::memset: { - MemSetInst *MemSet = cast(Intr); - Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), - MemSet->getLength(), MemSet->getAlignment(), - MemSet->isVolatile()); - Intr->eraseFromParent(); - continue; - } - default: - Intr->dump(); - llvm_unreachable("Don't know how to promote alloca intrinsic use."); - } - } -} - -FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { - return new AMDGPUPromoteAlloca(ST); -} diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp deleted file mode 100644 index 3ca0eca3417..00000000000 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ /dev/null @@ -1,63 +0,0 @@ -//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUTargetMachine.h" - -using namespace llvm; - -AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} - -//===----------------------------------------------------------------------===// -// Function handling callbacks - Functions are a seldom used feature of GPUS, so -// they are not supported at this time. 
-//===----------------------------------------------------------------------===// - -const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; - -const MCPhysReg* -AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return &CalleeSavedReg; -} - -void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("Subroutines not supported yet"); -} - -unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; -} - -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { - static const unsigned SubRegs[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, - AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, - AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, - AMDGPU::sub15 - }; - - assert(Channel < array_lengthof(SubRegs)); - return SubRegs[Channel]; -} - -unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { - - return getSubRegFromChannel(IndirectIndex); -} - -#define GET_REGINFO_TARGET_DESC -#include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h deleted file mode 100644 index cfd800bdc70..00000000000 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ /dev/null @@ -1,64 +0,0 @@ -//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief TargetRegisterInfo interface that is implemented by all hw codegen -/// targets. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H - -#include "llvm/ADT/BitVector.h" -#include "llvm/Target/TargetRegisterInfo.h" - -#define GET_REGINFO_HEADER -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" - -namespace llvm { - -class AMDGPUSubtarget; -class TargetInstrInfo; - -struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - static const MCPhysReg CalleeSavedReg; - - AMDGPURegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override { - assert(!"Unimplemented"); return BitVector(); - } - - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - - virtual unsigned getHWRegIndex(unsigned Reg) const { - assert(!"Unimplemented"); return 0; - } - - /// \returns the sub reg enum value for the given \p Channel - /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) - unsigned getSubRegFromChannel(unsigned Channel) const; - - const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; - unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getIndirectSubReg(unsigned IndirectIndex) const; - -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td deleted file mode 100644 index 835a1464395..00000000000 --- a/lib/Target/R600/AMDGPURegisterInfo.td +++ /dev/null @@ -1,26 +0,0 @@ -//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Tablegen register definitions common to all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -let Namespace = "AMDGPU" in { - -foreach Index = 0-15 in { - // Indices are used in a variety of ways here, so don't set a size/offset. - def sub#Index : SubRegIndex<-1, -1>; -} - -def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; - -} - -include "R600RegisterInfo.td" -include "SIRegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp deleted file mode 100644 index 605ccd0e136..00000000000 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ /dev/null @@ -1,133 +0,0 @@ -//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" -#include "R600MachineScheduler.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/CodeGen/MachineScheduler.h" - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-subtarget" - -#define GET_SUBTARGETINFO_ENUM -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#include "AMDGPUGenSubtargetInfo.inc" - -AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { - // Determine default and user-specified characteristics - // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be - // enabled, but some instructions do not respect them and they run at the - // double precision rate, so don't enable by default. - // - // We want to be able to turn these off, but making this a subtarget feature - // for SI has the unhelpful behavior that it unsets everything else if you - // disable it. 
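[Editor's note] getSubRegFromChannel above and the `foreach Index = 0-15` loop in AMDGPURegisterInfo.td describe the same mapping: 32-bit channel N of a wide register is addressed through sub-register index subN. A trivial standalone sketch of that lookup; the enumerator values are placeholders, since the real ones come from the generated AMDGPUGenRegisterInfo.inc.

.. code-block:: c++

  #include <cassert>

  // Placeholder enumerators standing in for AMDGPU::sub0..sub15.
  enum SubRegIndex { sub0, sub1, sub2,  sub3,  sub4,  sub5,  sub6,  sub7,
                     sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15 };

  unsigned getSubRegFromChannel(unsigned Channel) {
    static const SubRegIndex SubRegs[] = {
      sub0, sub1, sub2,  sub3,  sub4,  sub5,  sub6,  sub7,
      sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15};
    assert(Channel < sizeof(SubRegs) / sizeof(SubRegs[0]) && "only 16 channels");
    return SubRegs[Channel];
  }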
- - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); - FullFS += FS; - - if (GPU == "" && TT.getArch() == Triple::amdgcn) - GPU = "SI"; - - ParseSubtargetFeatures(GPU, FullFS); - - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP32Denormals = false; - FP64Denormals = false; - } - return *this; -} - -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - TargetMachine &TM) - : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), - DumpCode(false), R600ALUInst(false), HasVertexCache(false), - TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), - GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), - InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { - - initializeSubtargetDependencies(TT, GPU, FS); - - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); - TLInfo.reset(new R600TargetLowering(TM, *this)); - } else { - InstrInfo.reset(new SIInstrInfo(*this)); - TLInfo.reset(new SITargetLowering(TM, *this)); - } -} - -unsigned AMDGPUSubtarget::getStackEntrySize() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - switch(getWavefrontSize()) { - case 16: - return 8; - case 32: - return hasCaymanISA() ? 4 : 8; - case 64: - return 4; - default: - llvm_unreachable("Illegal wavefront size."); - } -} - -unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { - switch(getGeneration()) { - default: llvm_unreachable("ChipID unknown"); - case SEA_ISLANDS: return 12; - } -} - -bool AMDGPUSubtarget::isVGPRSpillingEnabled( - const SIMachineFunctionInfo *MFI) const { - return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; -} - -void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const { - if (getGeneration() >= SOUTHERN_ISLANDS) { - - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; - } -} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h deleted file mode 100644 index 0d40d14f820..00000000000 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ /dev/null @@ -1,282 +0,0 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief AMDGPU specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#include "AMDGPU.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -#define GET_SUBTARGETINFO_HEADER -#include "AMDGPUGenSubtargetInfo.inc" - -namespace llvm { - -class SIMachineFunctionInfo; - -class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { - -public: - enum Generation { - R600 = 0, - R700, - EVERGREEN, - NORTHERN_ISLANDS, - SOUTHERN_ISLANDS, - SEA_ISLANDS, - VOLCANIC_ISLANDS, - }; - - enum { - FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 - }; - -private: - std::string DevName; - bool Is64bit; - bool DumpCode; - bool R600ALUInst; - bool HasVertexCache; - short TexVTXClauseSize; - Generation Gen; - bool FP64; - bool FP64Denormals; - bool FP32Denormals; - bool FastFMAF32; - bool CaymanISA; - bool FlatAddressSpace; - bool EnableIRStructurizer; - bool EnablePromoteAlloca; - bool EnableIfCvt; - bool EnableLoadStoreOpt; - unsigned WavefrontSize; - bool CFALUBug; - int LocalMemorySize; - bool EnableVGPRSpilling; - bool SGPRInitBug; - bool IsGCN; - bool GCN1Encoding; - bool GCN3Encoding; - bool CIInsts; - bool FeatureDisable; - int LDSBankCount; - - AMDGPUFrameLowering FrameLowering; - std::unique_ptr TLInfo; - std::unique_ptr InstrInfo; - InstrItineraryData InstrItins; - Triple TargetTriple; - -public: - AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, - TargetMachine &TM); - AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS); - - const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const AMDGPUInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); - } - const AMDGPURegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); - } - AMDGPUTargetLowering *getTargetLowering() const override { - return TLInfo.get(); - } - const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; - } - - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - bool is64bit() const { - return Is64bit; - } - - bool hasVertexCache() const { - return HasVertexCache; - } - - short getTexVTXClauseSize() const { - return TexVTXClauseSize; - } - - Generation getGeneration() const { - return Gen; - } - - bool hasHWFP64() const { - return FP64; - } - - bool hasCaymanISA() const { - return CaymanISA; - } - - bool hasFP32Denormals() const { - return FP32Denormals; - } - - bool hasFP64Denormals() const { - return FP64Denormals; - } - - bool hasFastFMAF32() const { - return FastFMAF32; - } - - bool hasFlatAddressSpace() const { - return FlatAddressSpace; - } - - bool hasBFE() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBFI() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBFM() const { - return hasBFE(); - } - - bool hasBCNT(unsigned Size) const { - if (Size == 32) - return (getGeneration() >= EVERGREEN); - - if (Size == 64) - return (getGeneration() >= SOUTHERN_ISLANDS); - - return false; - } - - bool hasMulU24() const { - return (getGeneration() >= 
EVERGREEN); - } - - bool hasMulI24() const { - return (getGeneration() >= SOUTHERN_ISLANDS || - hasCaymanISA()); - } - - bool hasFFBL() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasFFBH() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasCARRY() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBORROW() const { - return (getGeneration() >= EVERGREEN); - } - - bool IsIRStructurizerEnabled() const { - return EnableIRStructurizer; - } - - bool isPromoteAllocaEnabled() const { - return EnablePromoteAlloca; - } - - bool isIfCvtEnabled() const { - return EnableIfCvt; - } - - bool loadStoreOptEnabled() const { - return EnableLoadStoreOpt; - } - - unsigned getWavefrontSize() const { - return WavefrontSize; - } - - unsigned getStackEntrySize() const; - - bool hasCFAluBug() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - return CFALUBug; - } - - int getLocalMemorySize() const { - return LocalMemorySize; - } - - bool hasSGPRInitBug() const { - return SGPRInitBug; - } - - int getLDSBankCount() const { - return LDSBankCount; - } - - unsigned getAmdKernelCodeChipID() const; - - bool enableMachineScheduler() const override { - return true; - } - - void overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const override; - - // Helper functions to simplify if statements - bool isTargetELF() const { - return false; - } - - StringRef getDeviceName() const { - return DevName; - } - - bool dumpCode() const { - return DumpCode; - } - bool r600ALUEncoding() const { - return R600ALUInst; - } - bool isAmdHsaOS() const { - return TargetTriple.getOS() == Triple::AMDHSA; - } - bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; - - unsigned getMaxWavesPerCU() const { - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 10; - - // FIXME: Not sure what this is for other subtagets. - llvm_unreachable("do not know max waves per CU for this subtarget."); - } - - bool enableSubRegLiveness() const override { - return true; - } -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp deleted file mode 100644 index d65c010888a..00000000000 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ /dev/null @@ -1,292 +0,0 @@ -//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The AMDGPU target machine contains all of the hardware specific -/// information needed to emit code for R600 and SI GPUs. 
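[Editor's note] Most feature queries in AMDGPUSubtarget.h above are written as ordered comparisons against the Generation enum (hasBFE() is getGeneration() >= EVERGREEN; hasBCNT() wants EVERGREEN for the 32-bit form but SOUTHERN_ISLANDS for the 64-bit one), which only works because the enumerators are declared in hardware order. A minimal toy sketch of the idiom, not the real subtarget.

.. code-block:: c++

  // Toy version of the ordered-generation idiom used by the subtarget above.
  enum Generation { R600, R700, EVERGREEN, NORTHERN_ISLANDS,
                    SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS };

  struct ToySubtarget {
    Generation Gen;

    // Features introduced with a generation remain available on later ones,
    // so a single >= comparison is enough.
    bool hasBFE() const { return Gen >= EVERGREEN; }

    bool hasBCNT(unsigned Size) const {
      if (Size == 32) return Gen >= EVERGREEN;
      if (Size == 64) return Gen >= SOUTHERN_ISLANDS;
      return false;
    }
  };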
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetMachine.h" -#include "AMDGPU.h" -#include "AMDGPUTargetTransformInfo.h" -#include "R600ISelLowering.h" -#include "R600InstrInfo.h" -#include "R600MachineScheduler.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_os_ostream.h" -#include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/Scalar.h" -#include - -using namespace llvm; - -extern "C" void LLVMInitializeR600Target() { - // Register the target - RegisterTargetMachine X(TheAMDGPUTarget); - RegisterTargetMachine Y(TheGCNTarget); -} - -static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { - return new ScheduleDAGMILive(C, make_unique()); -} - -static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); - -static std::string computeDataLayout(const Triple &TT) { - std::string Ret = "e-p:32:32"; - - if (TT.getArch() == Triple::amdgcn) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; - } - - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; - - return Ret; -} - -AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, - OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), - IntrinsicInfo() { - setRequiresStructuredCFG(true); - initAsmInfo(); -} - -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} - -//===----------------------------------------------------------------------===// -// R600 Target Machine (R600 -> Cayman) -//===----------------------------------------------------------------------===// - -R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} - -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// - -GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} - -//===----------------------------------------------------------------------===// -// AMDGPU Pass Setup -//===----------------------------------------------------------------------===// - -namespace { -class AMDGPUPassConfig : public TargetPassConfig { -public: - AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - AMDGPUTargetMachine &getAMDGPUTargetMachine() const { - return getTM(); - } - - 
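  // Note (added for clarity, not part of the original file): the override
  // below returns the custom R600 VLIW scheduler only for subtargets up to
  // NORTHERN_ISLANDS; returning nullptr for GCN subtargets lets the generic
  // machine scheduler be used instead.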
ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - return createR600MachineScheduler(C); - return nullptr; - } - - void addIRPasses() override; - void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; -}; - -class R600PassConfig : public AMDGPUPassConfig { -public: - R600PassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } - - bool addPreISel() override; - void addPreRegAlloc() override; - void addPreSched2() override; - void addPreEmitPass() override; -}; - -class GCNPassConfig : public AMDGPUPassConfig { -public: - GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) { } - bool addPreISel() override; - bool addInstSelector() override; - void addPreRegAlloc() override; - void addPostRegAlloc() override; - void addPreSched2() override; - void addPreEmitPass() override; -}; - -} // End of anonymous namespace - -TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(AMDGPUTTIImpl(this)); }); -} - -void AMDGPUPassConfig::addIRPasses() { - // Function calls are not supported, so make sure we inline everything. - addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerPass()); - // We need to add the barrier noop pass, otherwise adding the function - // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a nodule with two - // functions, then we will generate code for the first function - // without ever running any passes on the second. 
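  // Note (added for clarity): "nodule" in the comment above is a typo for
  // "module". The barrier inserted below keeps the always-inliner from being
  // folded into the per-function pipeline, so every function in the module is
  // inlined before any single function proceeds toward codegen.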
- addPass(createBarrierNoopPass()); - TargetPassConfig::addIRPasses(); -} - -void AMDGPUPassConfig::addCodeGenPrepare() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.isPromoteAllocaEnabled()) { - addPass(createAMDGPUPromoteAlloca(ST)); - addPass(createSROAPass()); - } - TargetPassConfig::addCodeGenPrepare(); -} - -bool -AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - addPass(createFlattenCFGPass()); - if (ST.IsIRStructurizerEnabled()) - addPass(createStructurizeCFGPass()); - return false; -} - -bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); - return false; -} - -//===----------------------------------------------------------------------===// -// R600 Pass Setup -//===----------------------------------------------------------------------===// - -bool R600PassConfig::addPreISel() { - AMDGPUPassConfig::addPreISel(); - addPass(createR600TextureIntrinsicsReplacer()); - return false; -} - -void R600PassConfig::addPreRegAlloc() { - addPass(createR600VectorRegMerger(*TM)); -} - -void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) - addPass(&IfConverterID, false); - addPass(createR600ClauseMergePass(*TM), false); -} - -void R600PassConfig::addPreEmitPass() { - addPass(createAMDGPUCFGStructurizerPass(), false); - addPass(createR600ExpandSpecialInstrsPass(*TM), false); - addPass(&FinalizeMachineBundlesID, false); - addPass(createR600Packetizer(*TM), false); - addPass(createR600ControlFlowFinalizer(*TM), false); -} - -TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { - return new R600PassConfig(this, PM); -} - -//===----------------------------------------------------------------------===// -// GCN Pass Setup -//===----------------------------------------------------------------------===// - -bool GCNPassConfig::addPreISel() { - AMDGPUPassConfig::addPreISel(); - addPass(createSinkingPass()); - addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); - return false; -} - -bool GCNPassConfig::addInstSelector() { - AMDGPUPassConfig::addInstSelector(); - addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); - addPass(createSIFoldOperandsPass()); - return false; -} - -void GCNPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - - // This needs to be run directly before register allocation because - // earlier passes might recompute live intervals. - // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass - if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); - } - - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { - // Don't do this with no optimizations since it throws away debug info by - // merging nonadjacent loads. - - // This should be run after scheduling, but before register allocation. It - // also need extra copies to the address operand to be eliminated. 
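  // Note (added for clarity): insertPass(&MachineSchedulerID,
  // &SILoadStoreOptimizerID) below runs the load/store optimizer immediately
  // after the machine scheduler, which is exactly the "after scheduling, but
  // before register allocation" placement described above.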
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); - } - addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); -} - -void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); - addPass(createSIShrinkInstructionsPass(), false); -} - -void GCNPassConfig::addPreSched2() { - addPass(createSIInsertWaits(*TM), false); -} - -void GCNPassConfig::addPreEmitPass() { - addPass(createSILowerControlFlowPass(*TM), false); -} - -TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { - return new GCNPassConfig(this, PM); -} diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h deleted file mode 100644 index 14792e347a7..00000000000 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ /dev/null @@ -1,89 +0,0 @@ -//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H - -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "llvm/IR/DataLayout.h" - -namespace llvm { - -//===----------------------------------------------------------------------===// -// AMDGPU Target Machine (R600+) -//===----------------------------------------------------------------------===// - -class AMDGPUTargetMachine : public LLVMTargetMachine { -private: - -protected: - TargetLoweringObjectFile *TLOF; - AMDGPUSubtarget Subtarget; - AMDGPUIntrinsicInfo IntrinsicInfo; - -public: - AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - ~AMDGPUTargetMachine(); - - const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - TargetIRAnalysis getTargetIRAnalysis() override; - - TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; - } -}; - -//===----------------------------------------------------------------------===// -// R600 Target Machine (R600 -> Cayman) -//===----------------------------------------------------------------------===// - -class R600TargetMachine : public AMDGPUTargetMachine { - -public: - R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -}; - -//===----------------------------------------------------------------------===// -// GCN Target Machine (SI+) -//===----------------------------------------------------------------------===// - -class GCNTargetMachine : public AMDGPUTargetMachine { - -public: - 
GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); - - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp deleted file mode 100644 index 6dacc742b12..00000000000 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp +++ /dev/null @@ -1,82 +0,0 @@ -//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// \file -// This file implements a TargetTransformInfo analysis pass specific to the -// AMDGPU target machine. It uses the target's detailed information to provide -// more precise answers to certain TTI queries, while letting the target -// independent and default TTI implementations handle the rest. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetTransformInfo.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -using namespace llvm; - -#define DEBUG_TYPE "AMDGPUtti" - -void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, - TTI::UnrollingPreferences &UP) { - UP.Threshold = 300; // Twice the default. - UP.MaxCount = UINT_MAX; - UP.Partial = true; - - // TODO: Do we want runtime unrolling? - - for (const BasicBlock *BB : L->getBlocks()) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - for (const Instruction &I : *BB) { - const GetElementPtrInst *GEP = dyn_cast(&I); - if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) - continue; - - const Value *Ptr = GEP->getPointerOperand(); - const AllocaInst *Alloca = - dyn_cast(GetUnderlyingObject(Ptr, DL)); - if (Alloca) { - // We want to do whatever we can to limit the number of alloca - // instructions that make it through to the code generator. allocas - // require us to use indirect addressing, which is slow and prone to - // compiler bugs. If this loop does an address calculation on an - // alloca ptr, then we want to use a higher than normal loop unroll - // threshold. This will give SROA a better chance to eliminate these - // allocas. - // - // Don't use the maximum allowed value here as it will make some - // programs way too big. - UP.Threshold = 800; - } - } - } -} - -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { - if (Vec) - return 0; - - // Number of VGPRs on SI. - if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 256; - - return 4 * 128; // XXX - 4 channels. Should these count as vector instead? -} - -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } - -unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { - // Semi-arbitrary large amount. 
- return 64; -} diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.h b/lib/Target/R600/AMDGPUTargetTransformInfo.h deleted file mode 100644 index 791c84e6f28..00000000000 --- a/lib/Target/R600/AMDGPUTargetTransformInfo.h +++ /dev/null @@ -1,78 +0,0 @@ -//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file a TargetTransformInfo::Concept conforming object specific to the -/// AMDGPU target machine. It uses the target's detailed information to -/// provide more precise answers to certain TTI queries, while letting the -/// target independent and default TTI implementations handle the rest. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H - -#include "AMDGPU.h" -#include "AMDGPUTargetMachine.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -class AMDGPUTTIImpl : public BasicTTIImplBase { - typedef BasicTTIImplBase BaseT; - typedef TargetTransformInfo TTI; - friend BaseT; - - const AMDGPUSubtarget *ST; - const AMDGPUTargetLowering *TLI; - - const AMDGPUSubtarget *getST() const { return ST; } - const AMDGPUTargetLowering *getTLI() const { return TLI; } - -public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM) - : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} - - // Provide value semantics. MSVC requires that we spell all of these out. - AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) - : BaseT(static_cast(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} - AMDGPUTTIImpl(AMDGPUTTIImpl &&Arg) - : BaseT(std::move(static_cast(Arg))), ST(std::move(Arg.ST)), - TLI(std::move(Arg.TLI)) {} - AMDGPUTTIImpl &operator=(const AMDGPUTTIImpl &RHS) { - BaseT::operator=(static_cast(RHS)); - ST = RHS.ST; - TLI = RHS.TLI; - return *this; - } - AMDGPUTTIImpl &operator=(AMDGPUTTIImpl &&RHS) { - BaseT::operator=(std::move(static_cast(RHS))); - ST = std::move(RHS.ST); - TLI = std::move(RHS.TLI); - return *this; - } - - bool hasBranchDivergence() { return true; } - - void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); - - TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? TTI::PSK_FastHardware : TTI::PSK_Software; - } - - unsigned getNumberOfRegisters(bool Vector); - unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(unsigned VF); -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp deleted file mode 100644 index c9b25a1a0b8..00000000000 --- a/lib/Target/R600/AMDILCFGStructurizer.cpp +++ /dev/null @@ -1,1912 +0,0 @@ -//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//==-----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SCCIterator.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "structcfg" - -#define DEFAULT_VEC_SLOTS 8 - -// TODO: move-begin. - -//===----------------------------------------------------------------------===// -// -// Statistics for CFGStructurizer. -// -//===----------------------------------------------------------------------===// - -STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " - "matched"); -STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " - "matched"); -STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " - "pattern matched"); -STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); -STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); - -namespace llvm { - void initializeAMDGPUCFGStructurizerPass(PassRegistry&); -} - -//===----------------------------------------------------------------------===// -// -// Miscellaneous utility for CFGStructurizer. 
-// -//===----------------------------------------------------------------------===// -namespace { -#define SHOWNEWINSTR(i) \ - DEBUG(dbgs() << "New instr: " << *i << "\n"); - -#define SHOWNEWBLK(b, msg) \ -DEBUG( \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - dbgs() << "\n"; \ -); - -#define SHOWBLK_DETAIL(b, msg) \ -DEBUG( \ - if (b) { \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - b->print(dbgs()); \ - dbgs() << "\n"; \ - } \ -); - -#define INVALIDSCCNUM -1 - -template -void ReverseVector(SmallVectorImpl &Src) { - size_t sz = Src.size(); - for (size_t i = 0; i < sz/2; ++i) { - NodeT *t = Src[i]; - Src[i] = Src[sz - i - 1]; - Src[sz - i - 1] = t; - } -} - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// -// supporting data structure for CFGStructurizer -// -//===----------------------------------------------------------------------===// - - -namespace { - -class BlockInformation { -public: - bool IsRetired; - int SccNum; - BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {} -}; - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// -// CFGStructurizer -// -//===----------------------------------------------------------------------===// - -namespace { -class AMDGPUCFGStructurizer : public MachineFunctionPass { -public: - typedef SmallVector MBBVector; - typedef std::map MBBInfoMap; - typedef std::map LoopLandInfoMap; - - enum PathToKind { - Not_SinglePath = 0, - SinglePath_InPath = 1, - SinglePath_NotInPath = 2 - }; - - static char ID; - - AMDGPUCFGStructurizer() : - MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) { - initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry()); - } - - const char *getPassName() const override { - return "AMDGPU Control Flow Graph structurizer Pass"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - } - - /// Perform the CFG structurization - bool run(); - - /// Perform the CFG preparation - /// This step will remove every unconditionnal/dead jump instructions and make - /// sure all loops have an exit block - bool prepare(); - - bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = &TII->getRegisterInfo(); - DEBUG(MF.dump();); - OrderedBlks.clear(); - Visited.clear(); - FuncRep = &MF; - MLI = &getAnalysis(); - DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); - MDT = &getAnalysis(); - DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr);); - PDT = &getAnalysis(); - DEBUG(PDT->print(dbgs());); - prepare(); - run(); - DEBUG(MF.dump();); - return true; - } - -protected: - MachineDominatorTree *MDT; - MachinePostDominatorTree *PDT; - MachineLoopInfo *MLI; - const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; - - // PRINT FUNCTIONS - /// Print the ordered Blocks. 
- void printOrderedBlocks() const { - size_t i = 0; - for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(), - iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) { - dbgs() << "BB" << (*iterBlk)->getNumber(); - dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")"; - if (i != 0 && i % 10 == 0) { - dbgs() << "\n"; - } else { - dbgs() << " "; - } - } - } - static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs(), 0); - } - } - - // UTILITY FUNCTIONS - int getSCCNum(MachineBasicBlock *MBB) const; - MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; - bool hasBackEdge(MachineBasicBlock *MBB) const; - static unsigned getLoopDepth(MachineLoop *LoopRep); - bool isRetiredBlock(MachineBasicBlock *MBB) const; - bool isActiveLoophead(MachineBasicBlock *MBB) const; - PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry = true) const; - int countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const; - bool needMigrateBlock(MachineBasicBlock *MBB) const; - - // Utility Functions - void reversePredicateSetter(MachineBasicBlock::iterator I); - /// Compute the reversed DFS post order of Blocks - void orderBlocks(MachineFunction *MF); - - // Function originally from CFGStructTraits - void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); - MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); - void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, - DebugLoc DL); - void insertCondBranchBefore(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL); - void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); - static int getBranchNzeroOpcode(int OldOpcode); - static int getBranchZeroOpcode(int OldOpcode); - static int getContinueNzeroOpcode(int OldOpcode); - static int getContinueZeroOpcode(int OldOpcode); - static MachineBasicBlock *getTrueBranch(MachineInstr *MI); - static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB); - static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI); - static bool isCondBranch(MachineInstr *MI); - static bool isUncondBranch(MachineInstr *MI); - static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); - static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); - /// The correct naming for this is getPossibleLoopendBlockBranchInstr. - /// - /// BB with backward-edge could have move instructions after the branch - /// instruction. Such move instruction "belong to" the loop backward-edge. 
- MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); - static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); - static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); - static bool isReturnBlock(MachineBasicBlock *MBB); - static void cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) ; - static MachineBasicBlock *clone(MachineBasicBlock *MBB); - /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose - /// because the AMDGPU instruction is not recognized as terminator fix this - /// and retire this routine - void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, - MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); - static void wrapup(MachineBasicBlock *MBB); - - - int patternMatch(MachineBasicBlock *MBB); - int patternMatchGroup(MachineBasicBlock *MBB); - int serialPatternMatch(MachineBasicBlock *MBB); - int ifPatternMatch(MachineBasicBlock *MBB); - int loopendPatternMatch(); - int mergeLoop(MachineLoop *LoopRep); - int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); - - void handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop); - /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in - /// the same loop with LoopLandInfo without explicitly keeping track of - /// loopContBlks and loopBreakBlks, this is a method to get the information. - bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB, - MachineBasicBlock *Src2MBB); - int handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); - int handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB); - int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr); - void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock *LandMBB, bool Detail = false); - int cloneOnSideEntryTo(MachineBasicBlock *PreMBB, - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB); - void mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB); - - void mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB); - void mergeLooplandBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *LandMBB); - void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, - MachineBasicBlock *LandMBB); - void settleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineBasicBlock *ContMBB); - /// normalizeInfiniteLoopExit change - /// B1: - /// uncond_br LoopHeader - /// - /// to - /// B1: - /// cond_br 1 LoopHeader dummyExit - /// and return the newly added dummy exit block - MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); - void removeUnconditionalBranch(MachineBasicBlock *MBB); - /// Remove duplicate branches instructions in a block. 
- /// For instance - /// B0: - /// cond_br X B1 B2 - /// cond_br X B1 B2 - /// is transformed to - /// B0: - /// cond_br X B1 B2 - void removeRedundantConditionalBranch(MachineBasicBlock *MBB); - void addDummyExitBlock(SmallVectorImpl &RetMBB); - void removeSuccessor(MachineBasicBlock *MBB); - MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, - MachineBasicBlock *PredMBB); - void migrateInstruction(MachineBasicBlock *SrcMBB, - MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); - void recordSccnum(MachineBasicBlock *MBB, int SCCNum); - void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); - - MachineBasicBlock *findNearestCommonPostDom(std::set&); - /// This is work around solution for findNearestCommonDominator not available - /// to post dom a proper fix should go to Dominators.h. - MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2); - -private: - MBBInfoMap BlockInfoMap; - LoopLandInfoMap LLInfoMap; - std::map Visited; - MachineFunction *FuncRep; - SmallVector OrderedBlks; -}; - -int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return INVALIDSCCNUM; - return (*It).second->SccNum; -} - -MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep) - const { - LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep); - if (It == LLInfoMap.end()) - return nullptr; - return (*It).second; -} - -bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - if (!LoopRep) - return false; - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - return MBB->isSuccessor(LoopHeader); -} - -unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { - return LoopRep ? 
LoopRep->getLoopDepth() : 0; -} - -bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { - MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); - if (It == BlockInfoMap.end()) - return false; - return (*It).second->IsRetired; -} - -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const { - MachineLoop *LoopRep = MLI->getLoopFor(MBB); - while (LoopRep && LoopRep->getHeader() == MBB) { - MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep); - if(!LoopLand) - return true; - if (!isRetiredBlock(LoopLand)) - return true; - LoopRep = LoopRep->getParentLoop(); - } - return false; -} -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo( - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, - bool AllowSideEntry) const { - assert(DstMBB); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - while (SrcMBB && SrcMBB->succ_size() == 1) { - SrcMBB = *SrcMBB->succ_begin(); - if (SrcMBB == DstMBB) - return SinglePath_InPath; - if (!AllowSideEntry && SrcMBB->pred_size() > 1) - return Not_SinglePath; - } - if (SrcMBB && SrcMBB->succ_size()==0) - return SinglePath_NotInPath; - return Not_SinglePath; -} - -int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It, - MBBVector::const_iterator E) const { - int Count = 0; - while (It != E) { - if (!isRetiredBlock(*It)) - ++Count; - ++It; - } - return Count; -} - -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { - unsigned BlockSizeThreshold = 30; - unsigned CloneInstrThreshold = 100; - bool MultiplePreds = MBB && (MBB->pred_size() > 1); - - if(!MultiplePreds) - return false; - unsigned BlkSize = MBB->size(); - return ((BlkSize > BlockSizeThreshold) && - (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold)); -} - -void AMDGPUCFGStructurizer::reversePredicateSetter( - MachineBasicBlock::iterator I) { - while (I--) { - if (I->getOpcode() == AMDGPU::PRED_X) { - switch (static_cast(I)->getOperand(2).getImm()) { - case OPCODE_IS_ZERO_INT: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO_INT); - return; - case OPCODE_IS_NOT_ZERO_INT: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO_INT); - return; - case OPCODE_IS_ZERO: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_NOT_ZERO); - return; - case OPCODE_IS_NOT_ZERO: - static_cast(I)->getOperand(2) - .setImm(OPCODE_IS_ZERO); - return; - default: - llvm_unreachable("PRED_X Opcode invalid!"); - } - } - } -} - -void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = MBB->getParent() - ->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->push_back(MI); - //assume the instruction doesn't take any reg operand ... - SHOWNEWINSTR(MI); -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); - if (MBB->begin() != MBB->end()) - MBB->insert(MBB->begin(), MI); - else - MBB->push_back(MI); - SHOWNEWINSTR(MI); - return MI; -} - -MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( - MachineBasicBlock::iterator I, int NewOpcode) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineInstr *NewMBB = - MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->insert(I, NewMBB); - //assume the instruction doesn't take any reg operand ... 
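  // Note (added for clarity): despite its name, NewMBB is a MachineInstr*,
  // not a basic block; it is the newly created instruction that was just
  // inserted before I.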
- SHOWNEWINSTR(NewMBB); - return NewMBB; -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { - MachineInstr *OldMI = &(*I); - MachineBasicBlock *MBB = OldMI->getParent(); - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - MBB->insert(I, NewMI); - MachineInstrBuilder MIB(*MF, NewMI); - MIB.addReg(OldMI->getOperand(1).getReg(), false); - SHOWNEWINSTR(NewMI); - //erase later oldInstr->eraseFromParent(); -} - -void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL) { - MachineFunction *MF = blk->getParent(); - MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); - //insert before - blk->insert(I, NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, - int NewOpcode, int RegNum) { - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewInstr = - MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->push_back(NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - -int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; - default: llvm_unreachable("internal error"); - }; - return -1; -} - -int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { - switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; - default: llvm_unreachable("internal error"); - } - return -1; -} - -MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) { - return MI->getOperand(0).getMBB(); -} - -void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI, - MachineBasicBlock *MBB) { - MI->getOperand(0).setMBB(MBB); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, - MachineInstr *MI) { - assert(MBB->succ_size() == 2); - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - MachineBasicBlock::succ_iterator It = MBB->succ_begin(); - MachineBasicBlock::succ_iterator Next = It; - ++Next; - return (*It == TrueBranch) ? 
*Next : *It; -} - -bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::JUMP_COND: - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return true; - default: - return false; - } - return false; -} - -bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::JUMP: - case AMDGPU::BRANCH: - return true; - default: - return false; - } - return false; -} - -DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) { - //get DebugLoc from the first MachineBasicBlock instruction with debug info - DebugLoc DL; - for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end(); - ++It) { - MachineInstr *instr = &(*It); - if (instr->getDebugLoc()) - DL = instr->getDebugLoc(); - } - return DL; -} - -MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr( - MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - MachineInstr *MI = &*It; - if (MI && (isCondBranch(MI) || isUncondBranch(MI))) - return MI; - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr( - MachineBasicBlock *MBB) { - for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend(); - It != E; ++It) { - // FIXME: Simplify - MachineInstr *MI = &*It; - if (MI) { - if (isCondBranch(MI) || isUncondBranch(MI)) - return MI; - else if (!TII->isMov(MI->getOpcode())) - break; - } - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *instr = &(*It); - if (instr->getOpcode() == AMDGPU::RETURN) - return instr; - } - return nullptr; -} - -MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *MI = &(*It); - if (MI->getOpcode() == AMDGPU::CONTINUE) - return MI; - } - return nullptr; -} - -bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { - MachineInstr *MI = getReturnInstr(MBB); - bool IsReturn = (MBB->succ_size() == 0); - if (MI) - assert(IsReturn); - else if (IsReturn) - DEBUG( - dbgs() << "BB" << MBB->getNumber() - <<" is return block without RETURN instr\n";); - return IsReturn; -} - -void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(), - iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It) - DstMBB->addSuccessor(*It); // *iter's predecessor is also taken care of -} - -MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { - MachineFunction *Func = MBB->getParent(); - MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); - Func->push_back(NewMBB); //insert to function - for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); - It != E; ++It) { - MachineInstr *MI = Func->CloneMachineInstr(It); - NewMBB->push_back(MI); - } - return NewMBB; -} - -void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith( - MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, - MachineBasicBlock *NewBlk) { - MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB); - if (BranchMI && isCondBranch(BranchMI) && - getTrueBranch(BranchMI) == OldMBB) - setTrueBranch(BranchMI, NewBlk); -} - -void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { - assert((!MBB->getParent()->getJumpTableInfo() - || 
MBB->getParent()->getJumpTableInfo()->isEmpty()) - && "found a jump table"); - - //collect continue right before endloop - SmallVector ContInstr; - MachineBasicBlock::iterator Pre = MBB->begin(); - MachineBasicBlock::iterator E = MBB->end(); - MachineBasicBlock::iterator It = Pre; - while (It != E) { - if (Pre->getOpcode() == AMDGPU::CONTINUE - && It->getOpcode() == AMDGPU::ENDLOOP) - ContInstr.push_back(Pre); - Pre = It; - ++It; - } - - //delete continue right before endloop - for (unsigned i = 0; i < ContInstr.size(); ++i) - ContInstr[i]->eraseFromParent(); - - // TODO to fix up jump table so later phase won't be confused. if - // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but - // there isn't such an interface yet. alternatively, replace all the other - // blocks in the jump table with the entryBlk //} - -} - - -bool AMDGPUCFGStructurizer::prepare() { - bool Changed = false; - - //FIXME: if not reducible flow graph, make it so ??? - - DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); - - orderBlocks(FuncRep); - - SmallVector RetBlks; - - // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - - if (ExitingMBBs.size() == 0) { - MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep); - if (DummyExitBlk) - RetBlks.push_back(DummyExitBlk); - } - } - - // Remove unconditional branch instr. - // Add dummy exit block iff there are multiple returns. - for (SmallVectorImpl::const_iterator - It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - removeUnconditionalBranch(MBB); - removeRedundantConditionalBranch(MBB); - if (isReturnBlock(MBB)) { - RetBlks.push_back(MBB); - } - assert(MBB->succ_size() <= 2); - } - - if (RetBlks.size() >= 2) { - addDummyExitBlock(RetBlks); - Changed = true; - } - - return Changed; -} - -bool AMDGPUCFGStructurizer::run() { - - //Assume reducible CFG... - DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); - -#ifdef STRESSTEST - //Use the worse block ordering to test the algorithm. - ReverseVector(orderedBlks); -#endif - - DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); - int NumIter = 0; - bool Finish = false; - MachineBasicBlock *MBB; - bool MakeProgress = false; - int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(), - OrderedBlks.end()); - - do { - ++NumIter; - DEBUG( - dbgs() << "numIter = " << NumIter - << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; - ); - - SmallVectorImpl::const_iterator It = - OrderedBlks.begin(); - SmallVectorImpl::const_iterator E = - OrderedBlks.end(); - - SmallVectorImpl::const_iterator SccBeginIter = - It; - MachineBasicBlock *SccBeginMBB = nullptr; - int SccNumBlk = 0; // The number of active blocks, init to a - // maximum possible number. - int SccNumIter; // Number of iteration in this SCC. - - while (It != E) { - MBB = *It; - - if (!SccBeginMBB) { - SccBeginIter = It; - SccBeginMBB = MBB; - SccNumIter = 0; - SccNumBlk = NumRemainedBlk; // Init to maximum possible number. - DEBUG( - dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); - dbgs() << "\n"; - ); - } - - if (!isRetiredBlock(MBB)) - patternMatch(MBB); - - ++It; - - bool ContNextScc = true; - if (It == E - || getSCCNum(SccBeginMBB) != getSCCNum(*It)) { - // Just finish one scc. 
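          // Note (added for clarity): the code below checks whether this SCC
          // actually shrank. If the count of still-active blocks did not
          // drop, the SCC is given up for this outer iteration; if it dropped
          // but is still greater than one, the iterator is rewound to
          // SccBeginIter and the same SCC is pattern-matched again.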
- ++SccNumIter; - int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); - if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { - DEBUG( - dbgs() << "Can't reduce SCC " << getSCCNum(MBB) - << ", sccNumIter = " << SccNumIter; - dbgs() << "doesn't make any progress\n"; - ); - ContNextScc = true; - } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { - SccNumBlk = sccRemainedNumBlk; - It = SccBeginIter; - ContNextScc = false; - DEBUG( - dbgs() << "repeat processing SCC" << getSCCNum(MBB) - << "sccNumIter = " << SccNumIter << '\n'; - ); - } else { - // Finish the current scc. - ContNextScc = true; - } - } else { - // Continue on next component in the current scc. - ContNextScc = false; - } - - if (ContNextScc) - SccBeginMBB = nullptr; - } //while, "one iteration" over the function. - - MachineBasicBlock *EntryMBB = - GraphTraits::nodes_begin(FuncRep); - if (EntryMBB->succ_size() == 0) { - Finish = true; - DEBUG( - dbgs() << "Reduce to one block\n"; - ); - } else { - int NewnumRemainedBlk - = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); - // consider cloned blocks ?? - if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) { - MakeProgress = true; - NumRemainedBlk = NewnumRemainedBlk; - } else { - MakeProgress = false; - DEBUG( - dbgs() << "No progress\n"; - ); - } - } - } while (!Finish && MakeProgress); - - // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits::nodes_begin(FuncRep)); - - // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - DEBUG( - dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; - ); - (*It).first->eraseFromParent(); //Remove from the parent Function. 
- } - delete (*It).second; - } - BlockInfoMap.clear(); - LLInfoMap.clear(); - - if (!Finish) { - DEBUG(FuncRep->viewCFG()); - llvm_unreachable("IRREDUCIBLE_CFG"); - } - - return true; -} - - - -void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) { - int SccNum = 0; - MachineBasicBlock *MBB; - for (scc_iterator It = scc_begin(MF); !It.isAtEnd(); - ++It, ++SccNum) { - const std::vector &SccNext = *It; - for (std::vector::const_iterator - blockIter = SccNext.begin(), blockEnd = SccNext.end(); - blockIter != blockEnd; ++blockIter) { - MBB = *blockIter; - OrderedBlks.push_back(MBB); - recordSccnum(MBB, SccNum); - } - } - - //walk through all the block in func to check for unreachable - typedef GraphTraits GTM; - MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF); - for (; It != E; ++It) { - MachineBasicBlock *MBB = &(*It); - SccNum = getSCCNum(MBB); - if (SccNum == INVALIDSCCNUM) - dbgs() << "unreachable block BB" << MBB->getNumber() << "\n"; - } -} - -int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { - int NumMatch = 0; - int CurMatch; - - DEBUG( - dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; - ); - - while ((CurMatch = patternMatchGroup(MBB)) > 0) - NumMatch += CurMatch; - - DEBUG( - dbgs() << "End patternMatch BB" << MBB->getNumber() - << ", numMatch = " << NumMatch << "\n"; - ); - - return NumMatch; -} - -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) { - int NumMatch = 0; - NumMatch += loopendPatternMatch(); - NumMatch += serialPatternMatch(MBB); - NumMatch += ifPatternMatch(MBB); - return NumMatch; -} - - -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) { - if (MBB->succ_size() != 1) - return 0; - - MachineBasicBlock *childBlk = *MBB->succ_begin(); - if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) - return 0; - - mergeSerialBlock(MBB, childBlk); - ++numSerialPatternMatch; - return 1; -} - -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) { - //two edges - if (MBB->succ_size() != 2) - return 0; - if (hasBackEdge(MBB)) - return 0; - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - if (!BranchMI) - return 0; - - assert(isCondBranch(BranchMI)); - int NumMatch = 0; - - MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI); - NumMatch += serialPatternMatch(TrueMBB); - NumMatch += ifPatternMatch(TrueMBB); - MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI); - NumMatch += serialPatternMatch(FalseMBB); - NumMatch += ifPatternMatch(FalseMBB); - MachineBasicBlock *LandBlk; - int Cloned = 0; - - assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty()); - // TODO: Simplify - if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1 - && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) { - // Diamond pattern - LandBlk = *TrueMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) { - // Triangle pattern, false is empty - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && *FalseMBB->succ_begin() == TrueMBB) { - // Triangle pattern, true is empty - // We reverse the predicate to make a triangle, empty false pattern; - std::swap(TrueMBB, FalseMBB); - reversePredicateSetter(MBB->end()); - LandBlk = FalseMBB; - FalseMBB = nullptr; - } else if (FalseMBB->succ_size() == 1 - && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) { - LandBlk = *FalseMBB->succ_begin(); - } else if (TrueMBB->succ_size() == 1 - && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) { - LandBlk = 
*TrueMBB->succ_begin(); - } else { - return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB); - } - - // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the - // new BB created for landBlk==NULL may introduce new challenge to the - // reduction process. - if (LandBlk && - ((TrueMBB && TrueMBB->pred_size() > 1) - || (FalseMBB && FalseMBB->pred_size() > 1))) { - Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk); - } - - if (TrueMBB && TrueMBB->pred_size() > 1) { - TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB); - ++Cloned; - } - - if (FalseMBB && FalseMBB->pred_size() > 1) { - FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB); - ++Cloned; - } - - mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk); - - ++numIfPatternMatch; - - numClonedBlock += Cloned; - - return 1 + Cloned + NumMatch; -} - -int AMDGPUCFGStructurizer::loopendPatternMatch() { - std::deque NestedLoops; - for (auto &It: *MLI) - for (MachineLoop *ML : depth_first(It)) - NestedLoops.push_front(ML); - - if (NestedLoops.size() == 0) - return 0; - - // Process nested loop outside->inside (we did push_front), - // so "continue" to a outside loop won't be mistaken as "break" - // of the current loop. - int Num = 0; - for (MachineLoop *ExaminedLoop : NestedLoops) { - if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) - continue; - DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); - int NumBreak = mergeLoop(ExaminedLoop); - if (NumBreak == -1) - break; - Num += NumBreak; - } - return Num; -} - -int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MBBVector ExitingMBBs; - LoopRep->getExitingBlocks(ExitingMBBs); - assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); - DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); - // We assume a single ExitBlk - MBBVector ExitBlks; - LoopRep->getExitBlocks(ExitBlks); - SmallPtrSet ExitBlkSet; - for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i) - ExitBlkSet.insert(ExitBlks[i]); - assert(ExitBlkSet.size() == 1); - MachineBasicBlock *ExitBlk = *ExitBlks.begin(); - assert(ExitBlk && "Loop has several exit block"); - MBBVector LatchBlks; - typedef GraphTraits > InvMBBTraits; - InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader), - PE = InvMBBTraits::child_end(LoopHeader); - for (; PI != PE; PI++) { - if (LoopRep->contains(*PI)) - LatchBlks.push_back(*PI); - } - - for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i) - mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk); - for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i) - settleLoopcontBlock(LatchBlks[i], LoopHeader); - int Match = 0; - do { - Match = 0; - Match += serialPatternMatch(LoopHeader); - Match += ifPatternMatch(LoopHeader); - } while (Match > 0); - mergeLooplandBlock(LoopHeader, ExitBlk); - MachineLoop *ParentLoop = LoopRep->getParentLoop(); - if (ParentLoop) - MLI->changeLoopFor(LoopHeader, ParentLoop); - else - MLI->removeBlock(LoopHeader); - Visited[LoopRep] = true; - return 1; -} - -int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, - MachineBasicBlock *LoopHeader) { - int NumCont = 0; - SmallVector ContMBB; - typedef GraphTraits > GTIM; - GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), - E = GTIM::child_end(LoopHeader); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; - if (LoopRep->contains(MBB)) { - handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), - LoopHeader, LoopRep); - 
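      // Note (added for clarity): the continuing block is only recorded here;
      // its successor edge back to LoopHeader is removed in the follow-up
      // loop, once every latch of this loop has been settled.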
ContMBB.push_back(MBB); - ++NumCont; - } - } - - for (SmallVectorImpl::iterator It = ContMBB.begin(), - E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); - } - - numLoopcontPatternMatch += NumCont; - - return NumCont; -} - - -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( - MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { - if (Src1MBB->succ_size() == 0) { - MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB); - if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { - MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; - if (TheEntry) { - DEBUG( - dbgs() << "isLoopContBreakBlock yes src1 = BB" - << Src1MBB->getNumber() - << " src2 = BB" << Src2MBB->getNumber() << "\n"; - ); - return true; - } - } - } - return false; -} - -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); - if (Num == 0) { - DEBUG( - dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; - ); - Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); - } - return Num; -} - -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { - int Num = 0; - MachineBasicBlock *DownBlk; - - //trueBlk could be the common post dominator - DownBlk = TrueMBB; - - DEBUG( - dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() - << " true = BB" << TrueMBB->getNumber() - << ", numSucc=" << TrueMBB->succ_size() - << " false = BB" << FalseMBB->getNumber() << "\n"; - ); - - while (DownBlk) { - DEBUG( - dbgs() << "check down = BB" << DownBlk->getNumber(); - ); - - if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { - DEBUG( - dbgs() << " working\n"; - ); - - Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); - Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); - - numClonedBlock += Num; - Num += serialPatternMatch(*HeadMBB->succ_begin()); - Num += serialPatternMatch(*std::next(HeadMBB->succ_begin())); - Num += ifPatternMatch(HeadMBB); - assert(Num > 0); - - break; - } - DEBUG( - dbgs() << " not working\n"; - ); - DownBlk = (DownBlk->succ_size() == 1) ? 
(*DownBlk->succ_begin()) : nullptr; - } // walk down the postDomTree - - return Num; -} - -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( - MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { - dbgs() << "head = BB" << HeadMBB->getNumber() - << " size = " << HeadMBB->size(); - if (Detail) { - dbgs() << "\n"; - HeadMBB->print(dbgs()); - dbgs() << "\n"; - } - - if (TrueMBB) { - dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = " - << TrueMBB->size() << " numPred = " << TrueMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - TrueMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (FalseMBB) { - dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = " - << FalseMBB->size() << " numPred = " << FalseMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - FalseMBB->print(dbgs()); - dbgs() << "\n"; - } - } - if (LandMBB) { - dbgs() << ", land = BB" << LandMBB->getNumber() << " size = " - << LandMBB->size() << " numPred = " << LandMBB->pred_size(); - if (Detail) { - dbgs() << "\n"; - LandMBB->print(dbgs()); - dbgs() << "\n"; - } - } - - dbgs() << "\n"; -} - -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, - MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, - MachineBasicBlock **LandMBBPtr) { - bool MigrateTrue = false; - bool MigrateFalse = false; - - MachineBasicBlock *LandBlk = *LandMBBPtr; - - assert((!TrueMBB || TrueMBB->succ_size() <= 1) - && (!FalseMBB || FalseMBB->succ_size() <= 1)); - - if (TrueMBB == FalseMBB) - return 0; - - MigrateTrue = needMigrateBlock(TrueMBB); - MigrateFalse = needMigrateBlock(FalseMBB); - - if (!MigrateTrue && !MigrateFalse) - return 0; - - // If we need to migrate either trueBlk and falseBlk, migrate the rest that - // have more than one predecessors. without doing this, its predecessor - // rather than headBlk will have undefined value in initReg. - if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1) - MigrateTrue = true; - if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) - MigrateFalse = true; - - DEBUG( - dbgs() << "before improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); - - // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk - // - // new: headBlk => if () {initReg = 1; org trueBlk branch} else - // {initReg = 0; org falseBlk branch } - // => landBlk => if (initReg) {org trueBlk} else {org falseBlk} - // => org landBlk - // if landBlk->pred_size() > 2, put the about if-else inside - // if (initReg !=2) {...} - // - // add initReg = initVal to headBlk - - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); - if (!MigrateTrue || !MigrateFalse) { - // XXX: We have an opportunity here to optimize the "branch into if" case - // here. Branch into if looks like this: - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true - // \ / - // done - // - // The diamond_head block begins the "if" and the diamond_true block - // is the block being "branched into". - // - // If MigrateTrue is true, then TrueBB is the block being "branched into" - // and if MigrateFalse is true, then FalseBB is the block being - // "branched into" - // - // Here is the pseudo code for how I think the optimization should work: - // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. - // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. - // 3. 
Move the branch instruction from diamond_head into its own basic - // block (new_block). - // 4. Add an unconditional branch from diamond_head to new_block - // 5. Replace the branch instruction in branch_from with an unconditional - // branch to new_block. If branch_from has multiple predecessors, then - // we need to replace the True/False block in the branch - // instruction instead of replacing it. - // 6. Change the condition of the branch instruction in new_block from - // COND to (COND || GPR0) - // - // In order insert these MOV instruction, we will need to use the - // RegisterScavenger. Usually liveness stops being tracked during - // the late machine optimization passes, however if we implement - // bool TargetRegisterInfo::requiresRegisterScavenging( - // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly - // by generic optimization passes. We will also need to make sure that - // all of our target-specific passes that run after regalloc and before - // the CFGStructurizer track liveness and we will need to modify this pass - // to correctly track liveness. - // - // After the above changes, the new CFG should look like this: - // entry - // / | - // diamond_head branch_from - // \ / - // new_block - // / | - // diamond_false diamond_true - // \ / - // done - // - // Without this optimization, we are forced to duplicate the diamond_true - // block and we will end up with a CFG like this: - // - // entry - // / | - // diamond_head branch_from - // / \ | - // diamond_false diamond_true diamond_true (duplicate) - // \ / | - // done --------------------| - // - // Duplicating diamond_true can be very costly especially if it has a - // lot of instructions. - return 0; - } - - int NumNewBlk = 0; - - bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - - //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); - - if (LandBlkHasOtherPred) { - llvm_unreachable("Extra register needed to handle CFG"); - unsigned CmpResReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, - CmpResReg, DebugLoc()); - } - - // XXX: We are running this after RA, so creating virtual registers will - // cause an assertion failure in the PostRA scheduling pass. - unsigned InitReg = - HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, - DebugLoc()); - - if (MigrateTrue) { - migrateInstruction(TrueMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 1). 
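The comment block above is easier to follow as a source-level picture. The sketch below is only an illustration with invented names (initReg, trueBody, falseBody, land); the pass itself performs the equivalent rewrite on MachineBasicBlocks and emits the IF_PREDICATE_SET/ELSE/ENDIF pseudo instructions seen in the surrounding code. It also shows why the assignment to initReg has to be inserted unconditionally: the landing block re-dispatches on initReg, so any path that reaches it without writing that value would read an undefined register.

.. code-block:: c++

   #include <cstdio>

   static void trueBody()  { std::puts("original true block");  }
   static void falseBody() { std::puts("original false block"); }
   static void land()      { std::puts("original landing block"); }

   // before:  head => if (cond) { trueBody } else { falseBody } => land
   // after:   head only records the taken side; a single landing block
   //          re-dispatches on initReg and then falls through to land.
   static void improvedJumpIntoIf(bool cond) {
     int initReg;
     if (cond)
       initReg = 1;   // "initReg = 1; org trueBlk branch"
     else
       initReg = 0;   // "initReg = 0; org falseBlk branch"
     if (initReg)     // landing block: if (initReg) { org trueBlk }
       trueBody();
     else             //                else         { org falseBlk }
       falseBody();
     land();          // org landBlk
   }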
- llvm_unreachable("Extra register needed to handle CFG"); - } - insertInstrBefore(I, AMDGPU::ELSE); - - if (MigrateFalse) { - migrateInstruction(FalseMBB, LandBlk, I); - // need to uncondionally insert the assignment to ensure a path from its - // predecessor rather than headBlk has valid value in initReg if - // (initVal != 0) - llvm_unreachable("Extra register needed to handle CFG"); - } - - if (LandBlkHasOtherPred) { - // add endif - insertInstrBefore(I, AMDGPU::ENDIF); - - // put initReg = 2 to other predecessors of landBlk - for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), - PE = LandBlk->pred_end(); PI != PE; ++PI) { - MachineBasicBlock *MBB = *PI; - if (MBB != TrueMBB && MBB != FalseMBB) - llvm_unreachable("Extra register needed to handle CFG"); - } - } - DEBUG( - dbgs() << "result from improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); - - // update landBlk - *LandMBBPtr = LandBlk; - - return NumNewBlk; -} - -void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop) { - DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() - << " header = BB" << ContMBB->getNumber() << "\n"; - dbgs() << "Trying to continue loop-depth = " - << getLoopDepth(ContLoop) - << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); - settleLoopcontBlock(ContingMBB, ContMBB); -} - -void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) { - DEBUG( - dbgs() << "serialPattern BB" << DstMBB->getNumber() - << " <= BB" << SrcMBB->getNumber() << "\n"; - ); - DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - - DstMBB->removeSuccessor(SrcMBB); - cloneSuccessorList(DstMBB, SrcMBB); - - removeSuccessor(SrcMBB); - MLI->removeBlock(SrcMBB); - retireBlock(SrcMBB); -} - -void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, - MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, - MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { - assert (TrueMBB); - DEBUG( - dbgs() << "ifPattern BB" << MBB->getNumber(); - dbgs() << "{ "; - if (TrueMBB) { - dbgs() << "BB" << TrueMBB->getNumber(); - } - dbgs() << " } else "; - dbgs() << "{ "; - if (FalseMBB) { - dbgs() << "BB" << FalseMBB->getNumber(); - } - dbgs() << " }\n "; - dbgs() << "landBlock: "; - if (!LandMBB) { - dbgs() << "NULL"; - } else { - dbgs() << "BB" << LandMBB->getNumber(); - } - dbgs() << "\n"; - ); - - int OldOpcode = BranchMI->getOpcode(); - DebugLoc BranchDL = BranchMI->getDebugLoc(); - -// transform to -// if cond -// trueBlk -// else -// falseBlk -// endif -// landBlk - - MachineBasicBlock::iterator I = BranchMI; - insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode), - BranchDL); - - if (TrueMBB) { - MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); - if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); - retireBlock(TrueMBB); - MLI->removeBlock(TrueMBB); - } - - if (FalseMBB) { - insertInstrBefore(I, AMDGPU::ELSE); - MBB->splice(I, FalseMBB, FalseMBB->begin(), - FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); - if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); - retireBlock(FalseMBB); - MLI->removeBlock(FalseMBB); - } - insertInstrBefore(I, AMDGPU::ENDIF); - - BranchMI->eraseFromParent(); - - if (LandMBB && TrueMBB && FalseMBB) - MBB->addSuccessor(LandMBB); - -} - -void 
AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, - MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); - - insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); - insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); -} - - -void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, - MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); - MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); - assert(BranchMI && isCondBranch(BranchMI)); - DebugLoc DL = BranchMI->getDebugLoc(); - MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI); - MachineBasicBlock::iterator I = BranchMI; - if (TrueBranch != LandMBB) - reversePredicateSetter(I); - insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); - insertInstrBefore(I, AMDGPU::BREAK); - insertInstrBefore(I, AMDGPU::ENDIF); - //now branchInst can be erase safely - BranchMI->eraseFromParent(); - //now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); -} - -void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineBasicBlock *ContMBB) { - DEBUG(dbgs() << "settleLoopcontBlock conting = BB" - << ContingMBB->getNumber() - << ", cont = BB" << ContMBB->getNumber() << "\n";); - - MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); - if (MI) { - assert(isCondBranch(MI)); - MachineBasicBlock::iterator I = MI; - MachineBasicBlock *TrueBranch = getTrueBranch(MI); - int OldOpcode = MI->getOpcode(); - DebugLoc DL = MI->getDebugLoc(); - - bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI); - - if (!UseContinueLogical) { - int BranchOpcode = - TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) : - getBranchZeroOpcode(OldOpcode); - insertCondBranchBefore(I, BranchOpcode, DL); - // insertEnd to ensure phi-moves, if exist, go before the continue-instr. - insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); - insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); - } else { - int BranchOpcode = - TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : - getContinueZeroOpcode(OldOpcode); - insertCondBranchBefore(I, BranchOpcode, DL); - } - - MI->eraseFromParent(); - } else { - // if we've arrived here then we've already erased the branch instruction - // travel back up the basic block to see the last reference of our debug - // location we've just inserted that reference here so it should be - // representative insertEnd to ensure phi-moves, if exist, go before the - // continue-instr. 
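Taken together, mergeLooplandBlock, mergeLoopbreakBlock and settleLoopcontBlock lower the remaining loop edges into the structured WHILELOOP/ENDLOOP, predicated BREAK and predicated CONTINUE pseudo instructions. The snippet below is a rough source-level analogue of the control flow they produce; the loop body and names are invented for illustration, since the pass itself rewrites MachineBasicBlocks rather than C++.

.. code-block:: c++

   #include <cstdio>

   // Rough correspondence (illustration only):
   //   loop region + land block -> WHILELOOP ... ENDLOOP (mergeLooplandBlock)
   //   exiting edge             -> predicated BREAK      (mergeLoopbreakBlock)
   //   early back edge          -> predicated CONTINUE   (settleLoopcontBlock)
   static void structuredLoop(int n) {
     for (int i = 0; i < n; ++i) {
       if (i % 2 == 0)
         continue;                  // continue edge settled above
       if (i > 8)
         break;                     // break edge routed to the land block
       std::printf("body %d\n", i);
     }
     // loop landing block: execution resumes here after BREAK or normal exit
   }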
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, - getLastDebugLocInBB(ContingMBB)); - } -} - -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB, - MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) { - int Cloned = 0; - assert(PreMBB->isSuccessor(SrcMBB)); - while (SrcMBB && SrcMBB != DstMBB) { - assert(SrcMBB->succ_size() == 1); - if (SrcMBB->pred_size() > 1) { - SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB); - ++Cloned; - } - - PreMBB = SrcMBB; - SrcMBB = *SrcMBB->succ_begin(); - } - - return Cloned; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, - MachineBasicBlock *PredMBB) { - assert(PredMBB->isSuccessor(MBB) && - "succBlk is not a prececessor of curBlk"); - - MachineBasicBlock *CloneMBB = clone(MBB); //clone instructions - replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); - //srcBlk, oldBlk, newBlk - - PredMBB->removeSuccessor(MBB); - PredMBB->addSuccessor(CloneMBB); - - // add all successor to cloneBlk - cloneSuccessorList(CloneMBB, MBB); - - numClonedInstr += MBB->size(); - - DEBUG( - dbgs() << "Cloned block: " << "BB" - << MBB->getNumber() << "size " << MBB->size() << "\n"; - ); - - SHOWNEWBLK(CloneMBB, "result of Cloned block: "); - - return CloneMBB; -} - -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, - MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) { - MachineBasicBlock::iterator SpliceEnd; - //look for the input branchinstr, not the AMDGPU branchinstr - MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); - if (!BranchMI) { - DEBUG( - dbgs() << "migrateInstruction don't see branch instr\n" ; - ); - SpliceEnd = SrcMBB->end(); - } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); - SpliceEnd = BranchMI; - } - DEBUG( - dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; - ); - - //splice insert before insertPos - DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); - - DEBUG( - dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; - ); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { - MachineBasicBlock *LoopHeader = LoopRep->getHeader(); - MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch(); - const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); - - if (!LoopHeader || !LoopLatch) - return nullptr; - MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch); - // Is LoopRep an infinite loop ? 
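normalizeInfiniteLoopExit addresses the question in the comment above: the check that follows treats the loop as infinite when its latch ends in an unconditional branch back to the header, i.e. the loop has no exit edge at all. The idea is to add a dummy exit block and turn the back edge into a conditional branch on a value intended to always be true, so behaviour is unchanged but the CFG gains an exit edge for the dominator and post-dominator analyses the structurizer relies on. The code below is only a source-level sketch with invented names; the real implementation builds a BRANCH_COND_i32 on a fresh virtual register and, per the llvm_unreachable later in the function, still needs that register to be materialised.

.. code-block:: c++

   #include <cstdio>

   static volatile bool alwaysTrue = true; // stands in for the constant-true
                                           // register fed to the new branch

   static void loopWithSyntheticExit() {
     // before: for (;;) { body(); }   -- no exit edge
     // after:  conditional back edge plus a dummy exit block
     while (alwaysTrue) {
       std::puts("loop body");
     }
     std::puts("dummy exit block");      // never reached in practice
   }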
- if (!BranchMI || !isUncondBranch(BranchMI)) - return nullptr; - - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); - DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - MachineBasicBlock::iterator I = BranchMI; - unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra register needed to handle CFG"); - MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32); - MachineInstrBuilder MIB(*FuncRep, NewMI); - MIB.addMBB(LoopHeader); - MIB.addReg(ImmReg, false); - SHOWNEWINSTR(NewMI); - BranchMI->eraseFromParent(); - LoopLatch->addSuccessor(DummyExitBlk); - - return DummyExitBlk; -} - -void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { - MachineInstr *BranchMI; - - // I saw two unconditional branch in one basic block in example - // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. - while ((BranchMI = getLoopendBlockBranchInstr(MBB)) - && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); - BranchMI->eraseFromParent(); - } -} - -void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( - MachineBasicBlock *MBB) { - if (MBB->succ_size() != 2) - return; - MachineBasicBlock *MBB1 = *MBB->succ_begin(); - MachineBasicBlock *MBB2 = *std::next(MBB->succ_begin()); - if (MBB1 != MBB2) - return; - - MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); - assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); - BranchMI->eraseFromParent(); - SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); -} - -void AMDGPUCFGStructurizer::addDummyExitBlock( - SmallVectorImpl &RetMBB) { - MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); - - for (SmallVectorImpl::iterator It = RetMBB.begin(), - E = RetMBB.end(); It != E; ++It) { - MachineBasicBlock *MBB = *It; - MachineInstr *MI = getReturnInstr(MBB); - if (MI) - MI->eraseFromParent(); - MBB->addSuccessor(DummyExitBlk); - DEBUG( - dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() - << " successors\n"; - ); - } - SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); -} - -void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) { - while (MBB->succ_size()) - MBB->removeSuccessor(*MBB->succ_begin()); -} - -void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, - int SccNum) { - BlockInformation *&srcBlkInfo = BlockInfoMap[MBB]; - if (!srcBlkInfo) - srcBlkInfo = new BlockInformation(); - srcBlkInfo->SccNum = SccNum; -} - -void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { - DEBUG( - dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; - ); - - BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; - - if (!SrcBlkInfo) - SrcBlkInfo = new BlockInformation(); - - SrcBlkInfo->IsRetired = true; - assert(MBB->succ_size() == 0 && MBB->pred_size() == 0 - && "can't retire block yet"); -} - -void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, - MachineBasicBlock *MBB) { - MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; - if (!MBB) { - MBB = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(MBB); //insert to function - SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); - } - TheEntry 
= MBB; - DEBUG( - dbgs() << "setLoopLandBlock loop-header = BB" - << loopRep->getHeader()->getNumber() - << " landing-block = BB" << MBB->getNumber() << "\n"; - ); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2) { - - if (PDT->dominates(MBB1, MBB2)) - return MBB1; - if (PDT->dominates(MBB2, MBB1)) - return MBB2; - - MachineDomTreeNode *Node1 = PDT->getNode(MBB1); - MachineDomTreeNode *Node2 = PDT->getNode(MBB2); - - // Handle newly cloned node. - if (!Node1 && MBB1->succ_size() == 1) - return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); - if (!Node2 && MBB2->succ_size() == 1) - return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); - - if (!Node1 || !Node2) - return nullptr; - - Node1 = Node1->getIDom(); - while (Node1) { - if (PDT->dominates(Node1, Node2)) - return Node1->getBlock(); - Node1 = Node1->getIDom(); - } - - return nullptr; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom( - std::set &MBBs) { - MachineBasicBlock *CommonDom; - std::set::const_iterator It = MBBs.begin(); - std::set::const_iterator E = MBBs.end(); - for (CommonDom = *It; It != E && CommonDom; ++It) { - MachineBasicBlock *MBB = *It; - if (MBB != CommonDom) - CommonDom = findNearestCommonPostDom(MBB, CommonDom); - } - - DEBUG( - dbgs() << "Common post dominator for exit blocks is "; - if (CommonDom) - dbgs() << "BB" << CommonDom->getNumber() << "\n"; - else - dbgs() << "NULL\n"; - ); - - return CommonDom; -} - -char AMDGPUCFGStructurizer::ID = 0; - -} // end anonymous namespace - - -INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer", - "AMDGPU CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer", - "AMDGPU CFG Structurizer", false, false) - -FunctionPass *llvm::createAMDGPUCFGStructurizerPass() { - return new AMDGPUCFGStructurizer(); -} diff --git a/lib/Target/R600/AMDKernelCodeT.h b/lib/Target/R600/AMDKernelCodeT.h deleted file mode 100644 index 4d3041ff3db..00000000000 --- a/lib/Target/R600/AMDKernelCodeT.h +++ /dev/null @@ -1,704 +0,0 @@ -//===-- AMDGPUKernelCodeT.h - Print AMDGPU assembly code ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file AMDKernelCodeT.h -//===----------------------------------------------------------------------===// - -#ifndef AMDKERNELCODET_H -#define AMDKERNELCODET_H - -#include -#include - -//---------------------------------------------------------------------------// -// AMD Kernel Code, and its dependencies // -//---------------------------------------------------------------------------// - -typedef uint8_t hsa_powertwo8_t; -typedef uint32_t hsa_ext_code_kind_t; -typedef uint8_t hsa_ext_brig_profile8_t; -typedef uint8_t hsa_ext_brig_machine_model8_t; -typedef uint64_t hsa_ext_control_directive_present64_t; -typedef uint16_t hsa_ext_exception_kind16_t; -typedef uint32_t hsa_ext_code_kind32_t; - -typedef struct hsa_dim3_s { - uint32_t x; - uint32_t y; - uint32_t z; -} hsa_dim3_t; - -/// The version of the amd_*_code_t struct. Minor versions must be -/// backward compatible. 
-typedef uint32_t amd_code_version32_t; -enum amd_code_version_t { - AMD_CODE_VERSION_MAJOR = 0, - AMD_CODE_VERSION_MINOR = 1 -}; - -/// The values used to define the number of bytes to use for the -/// swizzle element size. -enum amd_element_byte_size_t { - AMD_ELEMENT_2_BYTES = 0, - AMD_ELEMENT_4_BYTES = 1, - AMD_ELEMENT_8_BYTES = 2, - AMD_ELEMENT_16_BYTES = 3 -}; - -/// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and -/// COMPUTE_PGM_RSRC2 registers. -typedef uint64_t amd_compute_pgm_resource_register64_t; - -/// Every amd_*_code_t has the following properties, which are composed of -/// a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*), -/// bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount -/// (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0. -/// -/// (Note that bit fields cannot be used as their layout is -/// implementation defined in the C standard and so cannot be used to -/// specify an ABI) -typedef uint32_t amd_code_property32_t; -enum amd_code_property_mask_t { - - /// Enable the setup of the SGPR user data registers - /// (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t - /// for initial register state. - /// - /// The total number of SGPRuser data registers requested must not - /// exceed 16. Any requests beyond 16 will be ignored. - /// - /// Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of - /// SGPR user data registers enabled up to 16). - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, - 
- AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, - - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, - - /// Control wave ID base counter for GDS ordered-append. Used to set - /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if - /// ORDERED_APPEND_MODE also needs to be settable) - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, - - /// The interleave (swizzle) element size in bytes required by the - /// code for private memory. This must be 2, 4, 8 or 16. This value - /// is provided to the finalizer when it is invoked and is recorded - /// here. The hardware will interleave the memory requests of each - /// lane of a wavefront by this element size to ensure each - /// work-item gets a distinct memory memory location. Therefore, the - /// finalizer ensures that all load and store operations done to - /// private memory do not exceed this size. For example, if the - /// element size is 4 (32-bits or dword) and a 64-bit value must be - /// loaded, the finalizer will generate two 32-bit loads. This - /// ensures that the interleaving will get the the work-item - /// specific dword for both halves of the 64-bit value. If it just - /// did a 64-bit load then it would get one dword which belonged to - /// its own work-item, but the second dword would belong to the - /// adjacent lane work-item since the interleaving is in dwords. - /// - /// The value used must match the value that the runtime configures - /// the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This - /// is generally DWORD. - /// - /// Use values from the amd_element_byte_size_t enum. - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, - - /// Are global memory addresses 64 bits. Must match - /// amd_kernel_code_t.hsail_machine_model == - /// HSA_MACHINE_LARGE. Must also match - /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), - /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). - AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, - AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, - AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, - - /// Indicate if the generated ISA is using a dynamically sized call - /// stack. 
This can happen if calls are implemented using a call - /// stack and recursion, alloca or calls to indirect functions are - /// present. In these cases the Finalizer cannot compute the total - /// private segment size at compile time. In this case the - /// workitem_private_segment_byte_size only specifies the statically - /// know private segment size, and additional space must be added - /// for the call stack. - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, - - /// Indicate if code generated has support for debugging. - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT -}; - -/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL -/// control directives. These control how the finalizer generates code. This -/// struct is used both as an argument to hsaFinalizeKernel to specify values for -/// the control directives, and is used in HsaKernelCode to record the values of -/// the control directives that the finalize used when generating the code which -/// either came from the finalizer argument or explicit HSAIL control -/// directives. See the definition of the control directives in HSA Programmer's -/// Reference Manual which also defines how the values specified as finalizer -/// arguments have to agree with the control directives in the HSAIL code. -typedef struct hsa_ext_control_directives_s { - /// This is a bit set indicating which control directives have been - /// specified. If the value is 0 then there are no control directives specified - /// and the rest of the fields can be ignored. The bits are accessed using the - /// hsa_ext_control_directives_present_mask_t. Any control directive that is not - /// enabled in this bit set must have the value of all 0s. - hsa_ext_control_directive_present64_t enabled_control_directives; - - /// If enableBreakExceptions is not enabled then must be 0, otherwise must be - /// non-0 and specifies the set of HSAIL exceptions that must have the BREAK - /// policy enabled. If this set is not empty then the generated code may have - /// lower performance than if the set is empty. If the kernel being finalized - /// has any enablebreakexceptions control directives, then the values specified - /// by this argument are unioned with the values in these control - /// directives. If any of the functions the kernel calls have an - /// enablebreakexceptions control directive, then they must be equal or a - /// subset of, this union. - hsa_ext_exception_kind16_t enable_break_exceptions; - - /// If enableDetectExceptions is not enabled then must be 0, otherwise must be - /// non-0 and specifies the set of HSAIL exceptions that must have the DETECT - /// policy enabled. If this set is not empty then the generated code may have - /// lower performance than if the set is empty. However, an implementation - /// should endeavour to make the performance impact small. If the kernel being - /// finalized has any enabledetectexceptions control directives, then the - /// values specified by this argument are unioned with the values in these - /// control directives. 
If any of the functions the kernel calls have an - /// enabledetectexceptions control directive, then they must be equal or a - /// subset of, this union. - hsa_ext_exception_kind16_t enable_detect_exceptions; - - /// If maxDynamicGroupSize is not enabled then must be 0, and any amount of - /// dynamic group segment can be allocated for a dispatch, otherwise the value - /// specifies the maximum number of bytes of dynamic group segment that can be - /// allocated for a dispatch. If the kernel being finalized has any - /// maxdynamicsize control directives, then the values must be the same, and - /// must be the same as this argument if it is enabled. This value can be used - /// by the finalizer to determine the maximum number of bytes of group memory - /// used by each work-group by adding this value to the group memory required - /// for all group segment variables used by the kernel and all functions it - /// calls, and group memory used to implement other HSAIL features such as - /// fbarriers and the detect exception operations. This can allow the finalizer - /// to determine the expected number of work-groups that can be executed by a - /// compute unit and allow more resources to be allocated to the work-items if - /// it is known that fewer work-groups can be executed due to group memory - /// limitations. - uint32_t max_dynamic_group_size; - - /// If maxFlatGridSize is not enabled then must be 0, otherwise must be greater - /// than 0. See HSA Programmer's Reference Manual description of - /// maxflatgridsize control directive. - uint32_t max_flat_grid_size; - - /// If maxFlatWorkgroupSize is not enabled then must be 0, otherwise must be - /// greater than 0. See HSA Programmer's Reference Manual description of - /// maxflatworkgroupsize control directive. - uint32_t max_flat_workgroup_size; - - /// If requestedWorkgroupsPerCu is not enabled then must be 0, and the - /// finalizer is free to generate ISA that may result in any number of - /// work-groups executing on a single compute unit. Otherwise, the finalizer - /// should attempt to generate ISA that will allow the specified number of - /// work-groups to execute on a single compute unit. This is only a hint and - /// can be ignored by the finalizer. If the kernel being finalized, or any of - /// the functions it calls, has a requested control directive, then the values - /// must be the same. This can be used to determine the number of resources - /// that should be allocated to a single work-group and work-item. For example, - /// a low value may allow more resources to be allocated, resulting in higher - /// per work-item performance, as it is known there will never be more than the - /// specified number of work-groups actually executing on the compute - /// unit. Conversely, a high value may allocate fewer resources, resulting in - /// lower per work-item performance, which is offset by the fact it allows more - /// work-groups to actually execute on the compute unit. - uint32_t requested_workgroups_per_cu; - - /// If not enabled then all elements for Dim3 must be 0, otherwise every - /// element must be greater than 0. See HSA Programmer's Reference Manual - /// description of requiredgridsize control directive. - hsa_dim3_t required_grid_size; - - /// If requiredWorkgroupSize is not enabled then all elements for Dim3 must be - /// 0, and the produced code can be dispatched with any legal work-group range - /// consistent with the dispatch dimensions. 
Otherwise, the code produced must - /// always be dispatched with the specified work-group range. No element of the - /// specified range must be 0. It must be consistent with required_dimensions - /// and max_flat_workgroup_size. If the kernel being finalized, or any of the - /// functions it calls, has a requiredworkgroupsize control directive, then the - /// values must be the same. Specifying a value can allow the finalizer to - /// optimize work-group id operations, and if the number of work-items in the - /// work-group is less than the WAVESIZE then barrier operations can be - /// optimized to just a memory fence. - hsa_dim3_t required_workgroup_size; - - /// If requiredDim is not enabled then must be 0 and the produced kernel code - /// can be dispatched with 1, 2 or 3 dimensions. If enabled then the value is - /// 1..3 and the code produced must only be dispatched with a dimension that - /// matches. Other values are illegal. If the kernel being finalized, or any of - /// the functions it calls, has a requireddimsize control directive, then the - /// values must be the same. This can be used to optimize the code generated to - /// compute the absolute and flat work-group and work-item id, and the dim - /// HSAIL operations. - uint8_t required_dim; - - /// Reserved. Must be 0. - uint8_t reserved[75]; -} hsa_ext_control_directives_t; - -/// AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel -/// Code Object to set up the hardware to execute the kernel dispatch. -/// -/// Initial Kernel Register State. -/// -/// Initial kernel register state will be set up by CP/SPI prior to the start -/// of execution of every wavefront. This is limited by the constraints of the -/// current hardware. -/// -/// The order of the SGPR registers is defined, but the Finalizer can specify -/// which ones are actually setup in the amd_kernel_code_t object using the -/// enable_sgpr_* bit fields. The register numbers used for enabled registers -/// are dense starting at SGPR0: the first enabled register is SGPR0, the next -/// enabled register is SGPR1 etc.; disabled registers do not have an SGPR -/// number. -/// -/// The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and -/// apply to all waves of the grid. It is possible to specify more than 16 User -/// SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 -/// are actually initialized. These are then immediately followed by the System -/// SGPRs that are set up by ADC/SPI and can have different values for each wave -/// of the grid dispatch. -/// -/// SGPR register initial state is defined as follows: -/// -/// Private Segment Buffer (enable_sgpr_private_segment_buffer): -/// Number of User SGPR registers: 4. V# that can be used, together with -/// Scratch Wave Offset as an offset, to access the Private/Spill/Arg -/// segments using a segment address. It must be set as follows: -/// - Base address: of the scratch memory area used by the dispatch. It -/// does not include the scratch wave offset. It will be the per process -/// SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for -/// example there may be a per pipe offset, or per AQL Queue offset). -/// - Stride + data_format: Element Size * Index Stride (???) -/// - Cache swizzle: ??? -/// - Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for -/// scratch) -/// - Num records: Flat Scratch Work Item Size / Element Size (???) -/// - Dst_sel_*: ??? -/// - Num_format: ??? 
-/// - Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must -/// agree with amd_kernel_code_t.privateElementSize) -/// - Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must -/// be number of wavefront lanes for scratch, must agree with -/// amd_kernel_code_t.wavefrontSize) -/// - Add tid enable: 1 -/// - ATC: from SH_MEM_CONFIG.PRIVATE_ATC, -/// - Hash_enable: ??? -/// - Heap: ??? -/// - Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE -/// - Type: 0 (a buffer) (???) -/// -/// Dispatch Ptr (enable_sgpr_dispatch_ptr): -/// Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet -/// for kernel actually executing. -/// -/// Queue Ptr (enable_sgpr_queue_ptr): -/// Number of User SGPR registers: 2. 64 bit address of AmdQueue object for -/// AQL queue on which the dispatch packet was queued. -/// -/// Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): -/// Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This -/// is directly copied from the kernargPtr in the dispatch packet. Having CP -/// load it once avoids loading it at the beginning of every wavefront. -/// -/// Dispatch Id (enable_sgpr_dispatch_id): -/// Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch -/// packet being executed. -/// -/// Flat Scratch Init (enable_sgpr_flat_scratch_init): -/// Number of User SGPR registers: 2. This is 2 SGPRs. -/// -/// For CI/VI: -/// The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE -/// to base of memory for scratch for this dispatch. This is the same offset -/// used in computing the Scratch Segment Buffer base address. The value of -/// Scratch Wave Offset must be added by the kernel code and moved to -/// SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions. -/// -/// The second SGPR is 32 bit byte size of a single work-itemÂ’s scratch -/// memory usage. This is directly loaded from the dispatch packet Private -/// Segment Byte Size and rounded up to a multiple of DWORD. -/// -/// \todo [Does CP need to round this to >4 byte alignment?] -/// -/// The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in -/// flat memory instructions. Having CP load it once avoids loading it at -/// the beginning of every wavefront. -/// -/// For PI: -/// This is the 64 bit base address of the scratch backing memory for -/// allocated by CP for this dispatch. -/// -/// Private Segment Size (enable_sgpr_private_segment_size): -/// Number of User SGPR registers: 1. The 32 bit byte size of a single -/// work-itemÂ’s scratch memory allocation. This is the value from the dispatch -/// packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD. -/// -/// \todo [Does CP need to round this to >4 byte alignment?] -/// -/// Having CP load it once avoids loading it at the beginning of every -/// wavefront. -/// -/// \todo [This will not be used for CI/VI since it is the same value as -/// the second SGPR of Flat Scratch Init. However, it is need for PI which -/// changes meaning of Flat Scratchg Init..] -/// -/// Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): -/// Number of User SGPR registers: 1. 32 bit count of the number of -/// work-groups in the X dimension for the grid being executed. Computed from -/// the fields in the HsaDispatchPacket as -/// ((gridSize.x+workgroupSize.x-1)/workgroupSize.x). -/// -/// Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): -/// Number of User SGPR registers: 1. 
32 bit count of the number of -/// work-groups in the Y dimension for the grid being executed. Computed from -/// the fields in the HsaDispatchPacket as -/// ((gridSize.y+workgroupSize.y-1)/workgroupSize.y). -/// -/// Only initialized if <16 previous SGPRs initialized. -/// -/// Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): -/// Number of User SGPR registers: 1. 32 bit count of the number of -/// work-groups in the Z dimension for the grid being executed. Computed -/// from the fields in the HsaDispatchPacket as -/// ((gridSize.z+workgroupSize.z-1)/workgroupSize.z). -/// -/// Only initialized if <16 previous SGPRs initialized. -/// -/// Work-Group Id X (enable_sgpr_workgroup_id_x): -/// Number of System SGPR registers: 1. 32 bit work group id in X dimension -/// of grid for wavefront. Always present. -/// -/// Work-Group Id Y (enable_sgpr_workgroup_id_y): -/// Number of System SGPR registers: 1. 32 bit work group id in Y dimension -/// of grid for wavefront. -/// -/// Work-Group Id Z (enable_sgpr_workgroup_id_z): -/// Number of System SGPR registers: 1. 32 bit work group id in Z dimension -/// of grid for wavefront. If present then Work-group Id Y will also be -/// present -/// -/// Work-Group Info (enable_sgpr_workgroup_info): -/// Number of System SGPR registers: 1. {first_wave, 14Â’b0000, -/// ordered_append_term[10:0], threadgroup_size_in_waves[5:0]} -/// -/// Private Segment Wave Byte Offset -/// (enable_sgpr_private_segment_wave_byte_offset): -/// Number of System SGPR registers: 1. 32 bit byte offset from base of -/// dispatch scratch base. Must be used as an offset with Private/Spill/Arg -/// segment address when using Scratch Segment Buffer. It must be added to -/// Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing. -/// -/// -/// The order of the VGPR registers is defined, but the Finalizer can specify -/// which ones are actually setup in the amd_kernel_code_t object using the -/// enableVgpr* bit fields. The register numbers used for enabled registers -/// are dense starting at VGPR0: the first enabled register is VGPR0, the next -/// enabled register is VGPR1 etc.; disabled registers do not have an VGPR -/// number. -/// -/// VGPR register initial state is defined as follows: -/// -/// Work-Item Id X (always initialized): -/// Number of registers: 1. 32 bit work item id in X dimension of work-group -/// for wavefront lane. -/// -/// Work-Item Id X (enable_vgpr_workitem_id > 0): -/// Number of registers: 1. 32 bit work item id in Y dimension of work-group -/// for wavefront lane. -/// -/// Work-Item Id X (enable_vgpr_workitem_id > 0): -/// Number of registers: 1. 32 bit work item id in Z dimension of work-group -/// for wavefront lane. -/// -/// -/// The setting of registers is being done by existing GPU hardware as follows: -/// 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data -/// registers. -/// 2) Work-group Id registers X, Y, Z are set by SPI which supports any -/// combination including none. -/// 3) Scratch Wave Offset is also set by SPI which is why its value cannot -/// be added into the value Flat Scratch Offset which would avoid the -/// Finalizer generated prolog having to do the add. -/// 4) The VGPRs are set by SPI which only supports specifying either (X), -/// (X, Y) or (X, Y, Z). -/// -/// Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so -/// they can be moved as a 64 bit value to the hardware required SGPRn-3 and -/// SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? 
Register. -/// -/// The global segment can be accessed either using flat operations or buffer -/// operations. If buffer operations are used then the Global Buffer used to -/// access HSAIL Global/Readonly/Kernarg (which are combine) segments using a -/// segment address is not passed into the kernel code by CP since its base -/// address is always 0. Instead the Finalizer generates prolog code to -/// initialize 4 SGPRs with a V# that has the following properties, and then -/// uses that in the buffer instructions: -/// - base address of 0 -/// - no swizzle -/// - ATC=1 -/// - MTYPE set to support memory coherence specified in -/// amd_kernel_code_t.globalMemoryCoherence -/// -/// When the Global Buffer is used to access the Kernarg segment, must add the -/// dispatch packet kernArgPtr to a kernarg segment address before using this V#. -/// Alternatively scalar loads can be used if the kernarg offset is uniform, as -/// the kernarg segment is constant for the duration of the kernel execution. -/// -typedef struct amd_kernel_code_s { - /// The AMD major version of the Code Object. Must be the value - /// AMD_CODE_VERSION_MAJOR. - amd_code_version32_t amd_code_version_major; - - /// The AMD minor version of the Code Object. Minor versions must be - /// backward compatible. Must be the value - /// AMD_CODE_VERSION_MINOR. - amd_code_version32_t amd_code_version_minor; - - /// The byte size of this struct. Must be set to - /// sizeof(amd_kernel_code_t). Used for backward - /// compatibility. - uint32_t struct_byte_size; - - /// The target chip instruction set for which code has been - /// generated. Values are from the E_SC_INSTRUCTION_SET enumeration - /// in sc/Interface/SCCommon.h. - uint32_t target_chip; - - /// Byte offset (possibly negative) from start of amd_kernel_code_t - /// object to kernel's entry point instruction. The actual code for - /// the kernel is required to be 256 byte aligned to match hardware - /// requirements (SQ cache line is 16). The code must be position - /// independent code (PIC) for AMD devices to give runtime the - /// option of copying code to discrete GPU memory or APU L2 - /// cache. The Finalizer should endeavour to allocate all kernel - /// machine code in contiguous memory pages so that a device - /// pre-fetcher will tend to only pre-fetch Kernel Code objects, - /// improving cache performance. - int64_t kernel_code_entry_byte_offset; - - /// Range of bytes to consider prefetching expressed as an offset - /// and size. The offset is from the start (possibly negative) of - /// amd_kernel_code_t object. Set both to 0 if no prefetch - /// information is available. - /// - /// \todo ttye 11/15/2013 Is the prefetch definition we want? Did - /// not make the size a uint64_t as prefetching more than 4GiB seems - /// excessive. - int64_t kernel_code_prefetch_byte_offset; - uint64_t kernel_code_prefetch_byte_size; - - /// Number of bytes of scratch backing memory required for full - /// occupancy of target chip. This takes into account the number of - /// bytes of scratch per work-item, the wavefront size, the maximum - /// number of wavefronts per CU, and the number of CUs. This is an - /// upper limit on scratch. If the grid being dispatched is small it - /// may only need less than this. If the kernel uses no scratch, or - /// the Finalizer has not computed this value, it must be 0. - uint64_t max_scratch_backing_memory_byte_size; - - /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and - /// COMPUTE_PGM_RSRC2 registers. 
- amd_compute_pgm_resource_register64_t compute_pgm_resource_registers; - - /// Code properties. See amd_code_property_mask_t for a full list of - /// properties. - amd_code_property32_t code_properties; - - /// The amount of memory required for the combined private, spill - /// and arg segments for a work-item in bytes. If - /// is_dynamic_callstack is 1 then additional space must be added to - /// this value for the call stack. - uint32_t workitem_private_segment_byte_size; - - /// The amount of group segment memory required by a work-group in - /// bytes. This does not include any dynamically allocated group - /// segment memory that may be added when the kernel is - /// dispatched. - uint32_t workgroup_group_segment_byte_size; - - /// Number of byte of GDS required by kernel dispatch. Must be 0 if - /// not using GDS. - uint32_t gds_segment_byte_size; - - /// The size in bytes of the kernarg segment that holds the values - /// of the arguments to the kernel. This could be used by CP to - /// prefetch the kernarg segment pointed to by the dispatch packet. - uint64_t kernarg_segment_byte_size; - - /// Number of fbarrier's used in the kernel and all functions it - /// calls. If the implementation uses group memory to allocate the - /// fbarriers then that amount must already be included in the - /// workgroup_group_segment_byte_size total. - uint32_t workgroup_fbarrier_count; - - /// Number of scalar registers used by a wavefront. This includes - /// the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size - /// and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a - /// trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS. - uint16_t wavefront_sgpr_count; - - /// Number of vector registers used by each work-item. Used to set - /// COMPUTE_PGM_RSRC1.VGPRS. - uint16_t workitem_vgpr_count; - - /// If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the - /// first fixed VGPR number reserved. - uint16_t reserved_vgpr_first; - - /// The number of consecutive VGPRs reserved by the client. If - /// is_debug_supported then this count includes VGPRs reserved - /// for debugger use. - uint16_t reserved_vgpr_count; - - /// If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the - /// first fixed SGPR number reserved. - uint16_t reserved_sgpr_first; - - /// The number of consecutive SGPRs reserved by the client. If - /// is_debug_supported then this count includes SGPRs reserved - /// for debugger use. - uint16_t reserved_sgpr_count; - - /// If is_debug_supported is 0 then must be 0. Otherwise, this is the - /// fixed SGPR number used to hold the wave scratch offset for the - /// entire kernel execution, or uint16_t(-1) if the register is not - /// used or not known. - uint16_t debug_wavefront_private_segment_offset_sgpr; - - /// If is_debug_supported is 0 then must be 0. Otherwise, this is the - /// fixed SGPR number of the first of 4 SGPRs used to hold the - /// scratch V# used for the entire kernel execution, or uint16_t(-1) - /// if the registers are not used or not known. - uint16_t debug_private_segment_buffer_sgpr; - - /// The maximum byte alignment of variables used by the kernel in - /// the specified memory segment. Expressed as a power of two. Must - /// be at least HSA_POWERTWO_16. - hsa_powertwo8_t kernarg_segment_alignment; - hsa_powertwo8_t group_segment_alignment; - hsa_powertwo8_t private_segment_alignment; - - uint8_t reserved3; - - /// Type of code object. 
- hsa_ext_code_kind32_t code_type; - - /// Reserved for code properties if any are defined in the future. - /// There are currently no code properties so this field must be 0. - uint32_t reserved4; - - /// Wavefront size expressed as a power of two. Must be a power of 2 - /// in range 1..64 inclusive. Used to support runtime query that - /// obtains wavefront size, which may be used by application to - /// allocated dynamic group memory and set the dispatch work-group - /// size. - hsa_powertwo8_t wavefront_size; - - /// The optimization level specified when the kernel was - /// finalized. - uint8_t optimization_level; - - /// The HSAIL profile defines which features are used. This - /// information is from the HSAIL version directive. If this - /// amd_kernel_code_t is not generated from an HSAIL compilation - /// unit then must be 0. - hsa_ext_brig_profile8_t hsail_profile; - - /// The HSAIL machine model gives the address sizes used by the - /// code. This information is from the HSAIL version directive. If - /// not generated from an HSAIL compilation unit then must still - /// indicate for what machine mode the code is generated. - hsa_ext_brig_machine_model8_t hsail_machine_model; - - /// The HSAIL major version. This information is from the HSAIL - /// version directive. If this amd_kernel_code_t is not - /// generated from an HSAIL compilation unit then must be 0. - uint32_t hsail_version_major; - - /// The HSAIL minor version. This information is from the HSAIL - /// version directive. If this amd_kernel_code_t is not - /// generated from an HSAIL compilation unit then must be 0. - uint32_t hsail_version_minor; - - /// Reserved for HSAIL target options if any are defined in the - /// future. There are currently no target options so this field - /// must be 0. - uint16_t reserved5; - - /// Reserved. Must be 0. - uint16_t reserved6; - - /// The values should be the actually values used by the finalizer - /// in generating the code. This may be the union of values - /// specified as finalizer arguments and explicit HSAIL control - /// directives. If the finalizer chooses to ignore a control - /// directive, and not generate constrained code, then the control - /// directive should not be marked as enabled even though it was - /// present in the HSAIL or finalizer argument. The values are - /// intended to reflect the constraints that the code actually - /// requires to correctly execute, not the values that were - /// actually specified at finalize time. - hsa_ext_control_directives_t control_directive; - - /// The code can immediately follow the amd_kernel_code_t, or can - /// come after subsequent amd_kernel_code_t structs when there are - /// multiple kernels in the compilation unit. - -} amd_kernel_code_t; - -#endif // AMDKERNELCODET_H diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp deleted file mode 100644 index 02a63604970..00000000000 --- a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp +++ /dev/null @@ -1,1380 +0,0 @@ -//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -namespace { - -struct OptionalOperand; - -class AMDGPUOperand : public MCParsedAsmOperand { - enum KindTy { - Token, - Immediate, - Register, - Expression - } Kind; - - SMLoc StartLoc, EndLoc; - -public: - AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} - - MCContext *Ctx; - - enum ImmTy { - ImmTyNone, - ImmTyDSOffset0, - ImmTyDSOffset1, - ImmTyGDS, - ImmTyOffset, - ImmTyGLC, - ImmTySLC, - ImmTyTFE, - ImmTyClamp, - ImmTyOMod - }; - - struct TokOp { - const char *Data; - unsigned Length; - }; - - struct ImmOp { - bool IsFPImm; - ImmTy Type; - int64_t Val; - }; - - struct RegOp { - unsigned RegNo; - int Modifiers; - const MCRegisterInfo *TRI; - bool IsForcedVOP3; - }; - - union { - TokOp Tok; - ImmOp Imm; - RegOp Reg; - const MCExpr *Expr; - }; - - void addImmOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm(getImm())); - } - - StringRef getToken() const { - return StringRef(Tok.Data, Tok.Length); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); - } - - void addRegOrImmOperands(MCInst &Inst, unsigned N) const { - if (isReg()) - addRegOperands(Inst, N); - else - addImmOperands(Inst, N); - } - - void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm( - Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); - addRegOperands(Inst, N); - } - - void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { - if (isImm()) - addImmOperands(Inst, N); - else { - assert(isExpr()); - Inst.addOperand(MCOperand::createExpr(Expr)); - } - } - - bool defaultTokenHasSuffix() const { - StringRef Token(Tok.Data, Tok.Length); - - return Token.endswith("_e32") || Token.endswith("_e64"); - } - - bool isToken() const override { - return Kind == Token; - } - - bool isImm() const override { - return Kind == Immediate; - } - - bool isInlineImm() const { - float F = BitsToFloat(Imm.Val); - // TODO: Add 0.5pi for VI - return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || - (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || - F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); - } - - bool isDSOffset0() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset0; - } - - bool isDSOffset1() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset1; - } - - int64_t getImm() const { - return Imm.Val; - } - - enum ImmTy getImmTy() const { - assert(isImm()); - return Imm.Type; - } - - bool isRegKind() const { - return Kind == Register; - } - - bool isReg() const override { - return Kind == Register && Reg.Modifiers == -1; - } - - bool isRegWithInputMods() const { - return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); - } - - void setModifiers(unsigned Mods) { - assert(isReg()); - Reg.Modifiers = Mods; - } - - bool hasModifiers() const { - assert(isRegKind()); - return Reg.Modifiers != -1; - } - - unsigned getReg() const override { - return Reg.RegNo; - } - - bool isRegOrImm() const { - return isReg() || isImm(); - } - - bool isRegClass(unsigned RCID) const { - return Reg.TRI->getRegClass(RCID).contains(getReg()); - } - - bool isSCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); - } - - bool isSSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); - } - - bool isSSrc64() const { - return isImm() || isInlineImm() || - (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); - } - - bool isVCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); - } - - bool isVCSrc64() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); - } - - bool isVSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); - } - - bool isVSrc64() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); - } - - bool isMem() const override { - return false; - } - - bool isExpr() const { - return Kind == Expression; - } - - bool isSoppBrTarget() const { - return isExpr() || isImm(); - } - - SMLoc getStartLoc() const override { - return StartLoc; - } - - SMLoc getEndLoc() const override { - return EndLoc; - } - - void print(raw_ostream &OS) const override { } - - static std::unique_ptr CreateImm(int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, - bool IsFPImm = false) { - auto Op = llvm::make_unique(Immediate); - Op->Imm.Val = Val; - Op->Imm.IsFPImm = IsFPImm; - Op->Imm.Type = Type; - Op->StartLoc = Loc; - Op->EndLoc = Loc; - return Op; - } - - static std::unique_ptr CreateToken(StringRef Str, SMLoc Loc, - bool HasExplicitEncodingSize = true) { - auto Res = llvm::make_unique(Token); - Res->Tok.Data = Str.data(); - Res->Tok.Length = Str.size(); - Res->StartLoc = Loc; - Res->EndLoc = Loc; - return Res; - } - - static std::unique_ptr CreateReg(unsigned RegNo, SMLoc S, - SMLoc E, - const MCRegisterInfo *TRI, - 
bool ForceVOP3) { - auto Op = llvm::make_unique(Register); - Op->Reg.RegNo = RegNo; - Op->Reg.TRI = TRI; - Op->Reg.Modifiers = -1; - Op->Reg.IsForcedVOP3 = ForceVOP3; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static std::unique_ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { - auto Op = llvm::make_unique(Expression); - Op->Expr = Expr; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - bool isDSOffset() const; - bool isDSOffset01() const; - bool isSWaitCnt() const; - bool isMubufOffset() const; -}; - -class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; - const MCInstrInfo &MII; - MCAsmParser &Parser; - - unsigned ForcedEncodingSize; - /// @name Auto-generated Match Functions - /// { - -#define GET_ASSEMBLER_HEADER -#include "AMDGPUGenAsmMatcher.inc" - - /// } - -public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ - - if (STI.getFeatureBits().none()) { - // Set default features. - STI.ToggleFeature("SOUTHERN_ISLANDS"); - } - - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - - unsigned getForcedEncodingSize() const { - return ForcedEncodingSize; - } - - void setForcedEncodingSize(unsigned Size) { - ForcedEncodingSize = Size; - } - - bool isForcedVOP3() const { - return ForcedEncodingSize == 64; - } - - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - unsigned checkTargetMatchPredicate(MCInst &Inst) override; - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) override; - bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default = 0); - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, - OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseOptionalOps( - const ArrayRef &OptionalOps, - OperandVector &Operands); - - - void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); - void cvtDS(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); - - bool parseCnt(int64_t &IntVal); - OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); - OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); - - OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); - void cvtFlat(MCInst &Inst, const OperandVector &Operands); - - void cvtMubuf(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseOffset(OperandVector &Operands); - OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseGLC(OperandVector &Operands); - OperandMatchResultTy parseSLC(OperandVector 
&Operands); - OperandMatchResultTy parseTFE(OperandVector &Operands); - - OperandMatchResultTy parseDMask(OperandVector &Operands); - OperandMatchResultTy parseUNorm(OperandVector &Operands); - OperandMatchResultTy parseR128(OperandVector &Operands); - - void cvtVOP3(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); -}; - -struct OptionalOperand { - const char *Name; - AMDGPUOperand::ImmTy Type; - bool IsBit; - int64_t Default; - bool (*ConvertResult)(int64_t&); -}; - -} - -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { - switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); - case 1: return AMDGPU::VGPR_32RegClassID; - case 2: return AMDGPU::VReg_64RegClassID; - case 3: return AMDGPU::VReg_96RegClassID; - case 4: return AMDGPU::VReg_128RegClassID; - case 8: return AMDGPU::VReg_256RegClassID; - case 16: return AMDGPU::VReg_512RegClassID; - } - } - - switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } -} - -static unsigned getRegForName(const StringRef &RegName) { - - return StringSwitch(RegName) - .Case("exec", AMDGPU::EXEC) - .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) - .Case("m0", AMDGPU::M0) - .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) - .Case("vcc_lo", AMDGPU::VCC_LO) - .Case("vcc_hi", AMDGPU::VCC_HI) - .Case("exec_lo", AMDGPU::EXEC_LO) - .Case("exec_hi", AMDGPU::EXEC_HI) - .Default(0); -} - -bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - const AsmToken Tok = Parser.getTok(); - StartLoc = Tok.getLoc(); - EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); - RegNo = getRegForName(RegName); - - if (RegNo) { - Parser.Lex(); - return false; - } - - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') - return true; - - bool IsVgpr = RegName[0] == 'v'; - unsigned RegWidth; - unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register - RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) - return true; - Parser.Lex(); - } else { - // We have a register greater than 32-bits. - - int64_t RegLo, RegHi; - Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) - return true; - - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegLo)) - return true; - - if (getLexer().isNot(AsmToken::Colon)) - return true; - - Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegHi)) - return true; - - if (getLexer().isNot(AsmToken::RBrac)) - return true; - - Parser.Lex(); - RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { - // VGPR registers aren't aligned. - RegIndexInClass = RegLo; - } else { - // SGPR registers are aligned. Max alignment is 4 dwords. 
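// A worked example of this register-range handling (illustrative; it assumes
// the usual tuple order of the generated register classes): "s[2:3]" parses
// as RegLo = 2, RegHi = 3, so RegWidth = 2 and
// RegIndexInClass = 2 / std::min(2u, 4u) = 1, i.e. the second register in
// SGPR_64. "s[4:7]" gives RegWidth = 4 and RegIndexInClass = 4 / 4 = 1 in
// SReg_128. VGPR ranges are not divided, so "v[2:3]" keeps
// RegIndexInClass = 2 in VReg_64.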
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); - } - } - - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) - return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; -} - -unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { - - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - - if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || - (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) - return Match_InvalidOperand; - - return Match_Success; -} - - -bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - uint64_t &ErrorInfo, - bool MatchingInlineAsm) { - MCInst Inst; - - switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { - default: break; - case Match_Success: - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - case Match_MissingFeature: - return Error(IDLoc, "instruction not supported on this GPU"); - - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); - - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0ULL) { - if (ErrorInfo >= Operands.size()) { - if (isForcedVOP3()) { - // If 64-bit encoding has been forced we can end up with no - // clamp or omod operands if none of the registers have modifiers, - // so we need to add these to the operand list. - AMDGPUOperand &LastOp = - ((AMDGPUOperand &)*Operands[Operands.size() - 1]); - if (LastOp.isRegKind() || - (LastOp.isImm() && - LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { - SMLoc S = Parser.getTok().getLoc(); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyClamp)); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOMod)); - bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, - Out, ErrorInfo, - MatchingInlineAsm); - if (!Res) - return Res; - } - - } - return Error(IDLoc, "too few operands for instruction"); - } - - ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - } - return Error(ErrorLoc, "invalid operand for instruction"); - } - } - llvm_unreachable("Implement any new match types added!"); -} - -bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { - return true; -} - -static bool operandsHaveModifiers(const OperandVector &Operands) { - - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isRegKind() && Op.hasModifiers()) - return true; - if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || - Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) - return true; - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { - - // Try to parse with a custom parser - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); - - // If we successfully parsed the operand or if there as an error parsing, - // we are done. - // - // If we are parsing after we reach EndOfStatement then this means we - // are appending default values to the Operands list. This is only done - // by custom parser, so we shouldn't continue on to the generic parsing. 
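// A short summary of what the hand-written parsing below accepts (examples
// are illustrative): a 32-bit integer literal, a floating-point literal
// (stored as its IEEE bit pattern), a register name optionally wrapped in
// the '-' and '|...|' source modifiers, or a bare token. Bit 0 of the
// modifier flags is negation and bit 1 is absolute value, so "-v2" yields
// Reg.Modifiers == 1, "|v2|" yields 2, and "-|v2|" yields 3. A leading '-'
// also negates integer and floating-point literals.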
- if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || - getLexer().is(AsmToken::EndOfStatement)) - return ResTy; - - bool Negate = false, Abs = false; - if (getLexer().getKind()== AsmToken::Minus) { - Parser.Lex(); - Negate = true; - } - - if (getLexer().getKind() == AsmToken::Pipe) { - Parser.Lex(); - Abs = true; - } - - switch(getLexer().getKind()) { - case AsmToken::Integer: { - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; - APInt IntVal32(32, IntVal); - if (IntVal32.getSExtValue() != IntVal) { - Error(S, "invalid immediate: only 32-bit values are legal"); - return MatchOperand_ParseFail; - } - - IntVal = IntVal32.getSExtValue(); - if (Negate) - IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); - return MatchOperand_Success; - } - case AsmToken::Real: { - // FIXME: We should emit an error if a double precisions floating-point - // value is used. I'm not sure the best way to detect this. - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; - - APFloat F((float)BitsToDouble(IntVal)); - if (Negate) - F.changeSign(); - Operands.push_back( - AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); - return MatchOperand_Success; - } - case AsmToken::Identifier: { - SMLoc S, E; - unsigned RegNo; - if (!ParseRegister(RegNo, S, E)) { - - bool HasModifiers = operandsHaveModifiers(Operands); - unsigned Modifiers = 0; - - if (Negate) - Modifiers |= 0x1; - - if (Abs) { - if (getLexer().getKind() != AsmToken::Pipe) - return MatchOperand_ParseFail; - Parser.Lex(); - Modifiers |= 0x2; - } - - if (Modifiers && !HasModifiers) { - // We are adding a modifier to src1 or src2 and previous sources - // don't have modifiers, so we need to go back and empty modifers - // for each previous source. - for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; - --PrevRegIdx) { - - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); - RegOp.setModifiers(0); - } - } - - - Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), - isForcedVOP3())); - - if (HasModifiers || Modifiers) { - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); - RegOp.setModifiers(Modifiers); - - } - } else { - Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), - S)); - Parser.Lex(); - } - return MatchOperand_Success; - } - default: - return MatchOperand_NoMatch; - } -} - -bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, - SMLoc NameLoc, OperandVector &Operands) { - - // Clear any forced encodings from the previous instruction. - setForcedEncodingSize(0); - - if (Name.endswith("_e64")) - setForcedEncodingSize(64); - else if (Name.endswith("_e32")) - setForcedEncodingSize(32); - - // Add the instruction mnemonic - Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); - - while (!getLexer().is(AsmToken::EndOfStatement)) { - AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name); - - // Eat the comma or space if there is one. 
- if (getLexer().is(AsmToken::Comma)) - Parser.Lex(); - - switch (Res) { - case MatchOperand_Success: break; - case MatchOperand_ParseFail: return Error(getLexer().getLoc(), - "failed parsing operand."); - case MatchOperand_NoMatch: return Error(getLexer().getLoc(), - "not a valid operand."); - } - } - - // Once we reach end of statement, continue parsing so we can add default - // values for optional arguments. - AMDGPUAsmParser::OperandMatchResultTy Res; - while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { - if (Res != MatchOperand_Success) - return Error(getLexer().getLoc(), "failed parsing operand."); - } - return false; -} - -//===----------------------------------------------------------------------===// -// Utility functions -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default) { - - // We are at the end of the statement, and this is a default argument, so - // use a default value. - if (getLexer().is(AsmToken::EndOfStatement)) { - Int = Default; - return MatchOperand_Success; - } - - switch(getLexer().getKind()) { - default: return MatchOperand_NoMatch; - case AsmToken::Identifier: { - StringRef OffsetName = Parser.getTok().getString(); - if (!OffsetName.equals(Prefix)) - return MatchOperand_NoMatch; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Colon)) - return MatchOperand_ParseFail; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return MatchOperand_ParseFail; - - if (getParser().parseAbsoluteExpression(Int)) - return MatchOperand_ParseFail; - break; - } - } - return MatchOperand_Success; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { - - SMLoc S = Parser.getTok().getLoc(); - int64_t Offset = 0; - - AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); - if (Res != MatchOperand_Success) - return Res; - - Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); - return MatchOperand_Success; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { - int64_t Bit = 0; - SMLoc S = Parser.getTok().getLoc(); - - // We are at the end of the statement, and this is a default argument, so - // use a default value. 
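// Taken together, parseIntWithPrefix above and parseNamedBit here cover the
// two optional-operand spellings used later in this file (examples are
// illustrative): "offset:16" is an identifier matching the prefix, a colon
// and an integer, and yields 16, while for a named bit such as "gds" the
// bare name yields 1 and the "no"-prefixed form ("nogds") yields 0. When
// the statement has already ended, the default value is used instead.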
- if (getLexer().isNot(AsmToken::EndOfStatement)) { - switch(getLexer().getKind()) { - case AsmToken::Identifier: { - StringRef Tok = Parser.getTok().getString(); - if (Tok == Name) { - Bit = 1; - Parser.Lex(); - } else if (Tok.startswith("no") && Tok.endswith(Name)) { - Bit = 0; - Parser.Lex(); - } else { - return MatchOperand_NoMatch; - } - break; - } - default: - return MatchOperand_NoMatch; - } - } - - Operands.push_back(AMDGPUOperand::CreateImm(Bit, S, ImmTy)); - return MatchOperand_Success; -} - -static bool operandsHasOptionalOp(const OperandVector &Operands, - const OptionalOperand &OOp) { - for (unsigned i = 0; i < Operands.size(); i++) { - const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); - if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || - (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) - return true; - - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOptionalOps(const ArrayRef &OptionalOps, - OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - for (const OptionalOperand &Op : OptionalOps) { - if (operandsHasOptionalOp(Operands, Op)) - continue; - AMDGPUAsmParser::OperandMatchResultTy Res; - int64_t Value; - if (Op.IsBit) { - Res = parseNamedBit(Op.Name, Operands, Op.Type); - if (Res == MatchOperand_NoMatch) - continue; - return Res; - } - - Res = parseIntWithPrefix(Op.Name, Value, Op.Default); - - if (Res == MatchOperand_NoMatch) - continue; - - if (Res != MatchOperand_Success) - return Res; - - if (Op.ConvertResult && !Op.ConvertResult(Value)) { - return MatchOperand_ParseFail; - } - - Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); - return MatchOperand_Success; - } - return MatchOperand_NoMatch; -} - -//===----------------------------------------------------------------------===// -// ds -//===----------------------------------------------------------------------===// - -static const OptionalOperand DSOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -static const OptionalOperand DSOptionalOpsOff01 [] = { - {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, - {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOps, Operands); -} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOpsOff01, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - AMDGPUAsmParser::OperandMatchResultTy Res = - parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - if (Res == MatchOperand_NoMatch) { - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOffset)); - Res = MatchOperand_Success; - } - return Res; -} - -bool AMDGPUOperand::isDSOffset() const { - return isImm() && isUInt<16>(getImm()); -} - -bool AMDGPUOperand::isDSOffset01() const { - return isImm() && isUInt<8>(getImm()); -} - -void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, - const OperandVector &Operands) { - - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if 
(Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; - unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - - ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 - ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} - -void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { - - std::map OptionalIdx; - bool GDSOnly = false; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - if (Op.isToken() && Op.getToken() == "gds") { - GDSOnly = true; - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset - - if (!GDSOnly) { - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds - } - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} - - -//===----------------------------------------------------------------------===// -// s_waitcnt -//===----------------------------------------------------------------------===// - -bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { - StringRef CntName = Parser.getTok().getString(); - int64_t CntVal; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::LParen)) - return true; - - Parser.Lex(); - if (getLexer().isNot(AsmToken::Integer)) - return true; - - if (getParser().parseAbsoluteExpression(CntVal)) - return true; - - if (getLexer().isNot(AsmToken::RParen)) - return true; - - Parser.Lex(); - if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) - Parser.Lex(); - - int CntShift; - int CntMask; - - if (CntName == "vmcnt") { - CntMask = 0xf; - CntShift = 0; - } else if (CntName == "expcnt") { - CntMask = 0x7; - CntShift = 4; - } else if (CntName == "lgkmcnt") { - CntMask = 0x7; - CntShift = 8; - } else { - return true; - } - - IntVal &= ~(CntMask << CntShift); - IntVal |= (CntVal << CntShift); - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - // Disable all counters by default. - // vmcnt [3:0] - // expcnt [6:4] - // lgkmcnt [10:8] - int64_t CntVal = 0x77f; - SMLoc S = Parser.getTok().getLoc(); - - switch(getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: - // The operand can be an integer value. 
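// A minimal sketch of the packing parseCnt above performs, using the field
// layout listed in the comments (encodeWaitcnt is a hypothetical helper,
// not part of this parser):
//
//   constexpr int64_t encodeWaitcnt(int64_t Vm, int64_t Exp, int64_t Lgkm) {
//     return (Vm & 0xf) | ((Exp & 0x7) << 4) | ((Lgkm & 0x7) << 8);
//   }
//   static_assert(encodeWaitcnt(0xf, 0x7, 0x7) == 0x77f, "the default above");
//   static_assert(encodeWaitcnt(1, 0x7, 0x7) == 0x771, "vmcnt(1), others untouched");
//   static_assert(encodeWaitcnt(0, 0, 0) == 0x000, "wait for everything");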
- if (getParser().parseAbsoluteExpression(CntVal)) - return MatchOperand_ParseFail; - break; - - case AsmToken::Identifier: - do { - if (parseCnt(CntVal)) - return MatchOperand_ParseFail; - } while(getLexer().isNot(AsmToken::EndOfStatement)); - break; - } - Operands.push_back(AMDGPUOperand::CreateImm(CntVal, S)); - return MatchOperand_Success; -} - -bool AMDGPUOperand::isSWaitCnt() const { - return isImm(); -} - -//===----------------------------------------------------------------------===// -// sopp branch targets -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - - switch (getLexer().getKind()) { - default: return MatchOperand_ParseFail; - case AsmToken::Integer: { - int64_t Imm; - if (getParser().parseAbsoluteExpression(Imm)) - return MatchOperand_ParseFail; - Operands.push_back(AMDGPUOperand::CreateImm(Imm, S)); - return MatchOperand_Success; - } - - case AsmToken::Identifier: - Operands.push_back(AMDGPUOperand::CreateExpr( - MCSymbolRefExpr::create(getContext().getOrCreateSymbol( - Parser.getTok().getString()), getContext()), S)); - Parser.Lex(); - return MatchOperand_Success; - } -} - -//===----------------------------------------------------------------------===// -// flat -//===----------------------------------------------------------------------===// - -static const OptionalOperand FlatOptionalOps [] = { - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -static const OptionalOperand FlatAtomicOptionalOps [] = { - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatOptionalOps, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatAtomicOptionalOps, Operands); -} - -void AMDGPUAsmParser::cvtFlat(MCInst &Inst, - const OperandVector &Operands) { - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle 'glc' token which is sometimes hard-coded into the - // asm string. There are no MCInst operands for these. - if (Op.isToken()) - continue; - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - - } - - // flat atomic instructions don't have a glc argument. 
- if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - } - - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; - - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); -} - -//===----------------------------------------------------------------------===// -// mubuf -//===----------------------------------------------------------------------===// - -static const OptionalOperand MubufOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { - return parseOptionalOps(MubufOptionalOps, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOffset(OperandVector &Operands) { - return parseIntWithPrefix("offset", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseGLC(OperandVector &Operands) { - return parseNamedBit("glc", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSLC(OperandVector &Operands) { - return parseNamedBit("slc", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseTFE(OperandVector &Operands) { - return parseNamedBit("tfe", Operands); -} - -bool AMDGPUOperand::isMubufOffset() const { - return isImm() && isUInt<12>(getImm()); -} - -void AMDGPUAsmParser::cvtMubuf(MCInst &Inst, - const OperandVector &Operands) { - std::map OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle the case where soffset is an immediate - if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { - Op.addImmOperands(Inst, 1); - continue; - } - - // Handle tokens like 'offen' which are sometimes hard-coded into the - // asm string. There are no MCInst operands for these. 
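// For reference, the offset spellings accepted in this file are range
// checked by the is*Offset predicates: isMubufOffset requires a value that
// fits in 12 unsigned bits (0..4095), isDSOffset allows 16 bits, and
// isDSOffset01 allows 8 bits for each of the offset0/offset1 pair.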
- if (Op.isToken()) { - continue; - } - assert(Op.isImm()); - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - assert(OptionalIdx.size() == 4); - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; - - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); -} - -//===----------------------------------------------------------------------===// -// mimg -//===----------------------------------------------------------------------===// - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDMask(OperandVector &Operands) { - return parseIntWithPrefix("dmask", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { - return parseNamedBit("unorm", Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseR128(OperandVector &Operands) { - return parseNamedBit("r128", Operands); -} - -//===----------------------------------------------------------------------===// -// vop3 -//===----------------------------------------------------------------------===// - -static bool ConvertOmodMul(int64_t &Mul) { - if (Mul != 1 && Mul != 2 && Mul != 4) - return false; - - Mul >>= 1; - return true; -} - -static bool ConvertOmodDiv(int64_t &Div) { - if (Div == 1) { - Div = 0; - return true; - } - - if (Div == 2) { - Div = 3; - return true; - } - - return false; -} - -static const OptionalOperand VOP3OptionalOps [] = { - {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, - {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, - {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, -}; - -static bool isVOP3(OperandVector &Operands) { - if (operandsHaveModifiers(Operands)) - return true; - - AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); - - if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) - return true; - - if (Operands.size() >= 5) - return true; - - if (Operands.size() > 3) { - AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); - if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || - Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) - return true; - } - return false; -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { - - // The value returned by this function may change after parsing - // an operand so store the original value here. 
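// Worked example of the omod conversions above (illustrative): "mul:2" and
// "mul:4" pass through ConvertOmodMul, which shifts the literal right by
// one, giving output-modifier values 1 and 2; "div:2" is mapped to 3 by
// ConvertOmodDiv, and "clamp" is a plain bit. Any other literal makes the
// conversion fail and the operand is rejected.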
- bool HasModifiers = operandsHaveModifiers(Operands); - - bool IsVOP3 = isVOP3(Operands); - if (HasModifiers || IsVOP3 || - getLexer().isNot(AsmToken::EndOfStatement) || - getForcedEncodingSize() == 64) { - - AMDGPUAsmParser::OperandMatchResultTy Res = - parseOptionalOps(VOP3OptionalOps, Operands); - - if (!HasModifiers && Res == MatchOperand_Success) { - // We have added a modifier operation, so we need to make sure all - // previous register operands have modifiers - for (unsigned i = 2, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isReg()) - Op.setModifiers(0); - } - } - return Res; - } - return MatchOperand_NoMatch; -} - -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); - unsigned i = 2; - - std::map OptionalIdx; - - if (operandsHaveModifiers(Operands)) { - for (unsigned e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - if (Op.isRegWithInputMods()) { - ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); - continue; - } - OptionalIdx[Op.getImmTy()] = i; - } - - unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; - unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; - - ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); - } else { - for (unsigned e = Operands.size(); i != e; ++i) - ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); - } -} - -/// Force static initialization. -extern "C" void LLVMInitializeR600AsmParser() { - RegisterMCAsmParser A(TheAMDGPUTarget); - RegisterMCAsmParser B(TheGCNTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_MATCHER_IMPLEMENTATION -#include "AMDGPUGenAsmMatcher.inc" - diff --git a/lib/Target/R600/AsmParser/CMakeLists.txt b/lib/Target/R600/AsmParser/CMakeLists.txt deleted file mode 100644 index 1b42af73740..00000000000 --- a/lib/Target/R600/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMR600AsmParser - AMDGPUAsmParser.cpp - ) diff --git a/lib/Target/R600/AsmParser/LLVMBuild.txt b/lib/Target/R600/AsmParser/LLVMBuild.txt deleted file mode 100644 index 940e4cee6df..00000000000 --- a/lib/Target/R600/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/R600/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600AsmParser -parent = R600 -required_libraries = MC MCParser R600Desc R600Info Support -add_to_library_groups = R600 diff --git a/lib/Target/R600/AsmParser/Makefile b/lib/Target/R600/AsmParser/Makefile deleted file mode 100644 index e6689b54b6b..00000000000 --- a/lib/Target/R600/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. 
See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600AsmParser - -# Hack: we need to include 'main' R600 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/CIInstructions.td b/lib/Target/R600/CIInstructions.td deleted file mode 100644 index 2f5fdbe9207..00000000000 --- a/lib/Target/R600/CIInstructions.td +++ /dev/null @@ -1,149 +0,0 @@ -//===-- CIInstructions.td - CI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Instruction definitions for CI and newer. -//===----------------------------------------------------------------------===// - - -def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; - -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let SubtargetPredicate = isCIVI in { - -defm V_TRUNC_F64 : VOP1Inst , "v_trunc_f64", - VOP_F64_F64, ftrunc ->; -defm V_CEIL_F64 : VOP1Inst , "v_ceil_f64", - VOP_F64_F64, fceil ->; -defm V_FLOOR_F64 : VOP1Inst , "v_floor_f64", - VOP_F64_F64, ffloor ->; -defm V_RNDNE_F64 : VOP1Inst , "v_rndne_f64", - VOP_F64_F64, frint ->; -defm V_LOG_LEGACY_F32 : VOP1Inst , "v_log_legacy_f32", - VOP_F32_F32 ->; -defm V_EXP_LEGACY_F32 : VOP1Inst , "v_exp_legacy_f32", - VOP_F32_F32 ->; - -//===----------------------------------------------------------------------===// -// Flat Instructions -//===----------------------------------------------------------------------===// - -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 ->; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 ->; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 ->; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; -defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, 
"flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; -defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; - -} // End SubtargetPredicate = isCIVI - -//===----------------------------------------------------------------------===// -// Flat Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [HasFlatAddressSpace] in { - -class FLATLoad_Pattern : - Pat <(vt (flat_ld i64:$ptr)), - (Instr_ADDR64 $ptr, 0, 0, 0) ->; - -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; -def : FLATLoad_Pattern ; - -class FLATStore_Pattern : - Pat <(st vt:$value, i64:$ptr), - (Instr $value, $ptr, 0, 0, 0) - >; - -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; -def : FLATStore_Pattern ; - -} // End HasFlatAddressSpace predicate - diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt deleted file mode 100644 index 3c1bc49f282..00000000000 --- a/lib/Target/R600/CMakeLists.txt +++ /dev/null @@ -1,64 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS AMDGPU.td) - 
-tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) -tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) -tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) -tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) -tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) -tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) -tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) -add_public_tablegen_target(AMDGPUCommonTableGen) - -add_llvm_target(R600CodeGen - AMDILCFGStructurizer.cpp - AMDGPUAlwaysInlinePass.cpp - AMDGPUAsmPrinter.cpp - AMDGPUFrameLowering.cpp - AMDGPUIntrinsicInfo.cpp - AMDGPUISelDAGToDAG.cpp - AMDGPUMCInstLower.cpp - AMDGPUMachineFunction.cpp - AMDGPUSubtarget.cpp - AMDGPUTargetMachine.cpp - AMDGPUTargetTransformInfo.cpp - AMDGPUISelLowering.cpp - AMDGPUInstrInfo.cpp - AMDGPUPromoteAlloca.cpp - AMDGPURegisterInfo.cpp - R600ClauseMergePass.cpp - R600ControlFlowFinalizer.cpp - R600EmitClauseMarkers.cpp - R600ExpandSpecialInstrs.cpp - R600InstrInfo.cpp - R600ISelLowering.cpp - R600MachineFunctionInfo.cpp - R600MachineScheduler.cpp - R600OptimizeVectorRegisters.cpp - R600Packetizer.cpp - R600RegisterInfo.cpp - R600TextureIntrinsicsReplacer.cpp - SIAnnotateControlFlow.cpp - SIFixControlFlowLiveIntervals.cpp - SIFixSGPRCopies.cpp - SIFixSGPRLiveRanges.cpp - SIFoldOperands.cpp - SIInsertWaits.cpp - SIInstrInfo.cpp - SIISelLowering.cpp - SILoadStoreOptimizer.cpp - SILowerControlFlow.cpp - SILowerI1Copies.cpp - SIMachineFunctionInfo.cpp - SIPrepareScratchRegs.cpp - SIRegisterInfo.cpp - SIShrinkInstructions.cpp - SITypeRewriter.cpp - ) - -add_subdirectory(AsmParser) -add_subdirectory(InstPrinter) -add_subdirectory(TargetInfo) -add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td deleted file mode 100644 index ba4df82a6d3..00000000000 --- a/lib/Target/R600/CaymanInstructions.td +++ /dev/null @@ -1,226 +0,0 @@ -//===-- CaymanInstructions.td - CM Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are available only on Cayman -// family GPUs. 
-// -//===----------------------------------------------------------------------===// - -def isCayman : Predicate<"Subtarget->hasCaymanISA()">; - -//===----------------------------------------------------------------------===// -// Cayman Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isCayman] in { - -def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU ->; -def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", - [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU ->; - -def : IMad24Pat; - -let isVector = 1 in { - -def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; - -def MULLO_INT_cm : MULLO_INT_Common<0x8F>; -def MULHI_INT_cm : MULHI_INT_Common<0x90>; -def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; -def MULHI_UINT_cm : MULHI_UINT_Common<0x92>; -def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>; -def EXP_IEEE_cm : EXP_IEEE_Common<0x81>; -def LOG_IEEE_cm : LOG_IEEE_Common<0x83>; -def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>; -def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>; -def SIN_cm : SIN_Common<0x8D>; -def COS_cm : COS_Common<0x8E>; -} // End isVector = 1 - -def : RsqPat; - -def : POW_Common ; - -defm DIV_cm : DIV_Common; -defm : Expand24UBitOps; - -// RECIP_UINT emulation for Cayman -// The multiplication scales from [0,1] to the unsigned integer range -def : Pat < - (AMDGPUurecip i32:$src0), - (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), - (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) ->; - - def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { - let ADDR = 0; - let POP_COUNT = 0; - let COUNT = 0; - } - - -def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; - -class RAT_STORE_DWORD mask> : - CF_MEM_RAT_CACHELESS <0x14, 0, mask, - (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), - "STORE_DWORD $rw_gpr, $index_gpr", - [(global_store vt:$rw_gpr, i32:$index_gpr)]> { - let eop = 0; // This bit is not used on Cayman. -} - -def RAT_STORE_DWORD32 : RAT_STORE_DWORD ; -def RAT_STORE_DWORD64 : RAT_STORE_DWORD ; -def RAT_STORE_DWORD128 : RAT_STORE_DWORD ; - -class VTX_READ_cm buffer_id, dag outs, list pattern> - : VTX_WORD0_cm, VTX_READ { - - // Static fields - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; - let SRC_REL = 0; - // XXX: We can infer this field based on the SRC_GPR. This would allow us - // to store vertex addresses in any channel, not just X. 
- let SRC_SEL_X = 0; - let SRC_SEL_Y = 0; - let STRUCTURED_READ = 0; - let LDS_REQ = 0; - let COALESCED_READ = 0; - - let Inst{31-0} = Word0; -} - -class VTX_READ_8_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 1; // FMT_8 -} - -class VTX_READ_16_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 5; // FMT_16 - -} - -class VTX_READ_32_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 0xD; // COLOR_32 - - // This is not really necessary, but there were some GPU hangs that appeared - // to be caused by ALU instructions in the next instruction group that wrote - // to the $src_gpr registers of the VTX_READ. - // e.g. - // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 - // %T2_X = MOV %ZERO - //Adding this constraint prevents this from happening. - let Constraints = "$src_gpr.ptr = $dst_gpr"; -} - -class VTX_READ_64_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 7; - let DST_SEL_W = 7; - let DATA_FORMAT = 0x1D; // COLOR_32_32 -} - -class VTX_READ_128_cm buffer_id, list pattern> - : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { - - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 - - // XXX: Need to force VTX_READ_128 instructions to write to the same register - // that holds its buffer address to avoid potential hangs. We can't use - // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst - // registers are different sizes. 
-} - -//===----------------------------------------------------------------------===// -// VTX Read from parameter memory space -//===----------------------------------------------------------------------===// -def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -//===----------------------------------------------------------------------===// -// VTX Read from global memory space -//===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -} // End isCayman - diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td deleted file mode 100644 index 7adcd46fe19..00000000000 --- a/lib/Target/R600/EvergreenInstructions.td +++ /dev/null @@ -1,670 +0,0 @@ -//===-- EvergreenInstructions.td - EG Instruction defs ----*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are: -// - Available to Evergreen and newer VLIW4/VLIW5 GPUs -// - Available only on Evergreen family GPUs. 
-// -//===----------------------------------------------------------------------===// - -def isEG : Predicate< - "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!Subtarget->hasCaymanISA()" ->; - -def isEGorCayman : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" ->; - -//===----------------------------------------------------------------------===// -// Evergreen / Cayman store instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEGorCayman] in { - -class CF_MEM_RAT_CACHELESS rat_inst, bits<4> rat_id, bits<4> mask, dag ins, - string name, list pattern> - : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins, - "MEM_RAT_CACHELESS "#name, pattern>; - -class CF_MEM_RAT rat_inst, bits<4> rat_id, dag ins, string name, - list pattern> - : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, - "MEM_RAT "#name, pattern>; - -def RAT_MSKOR : CF_MEM_RAT <0x11, 0, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), - "MSKOR $rw_gpr.XW, $index_gpr", - [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)] -> { - let eop = 0; -} - -} // End let Predicates = [isEGorCayman] - -//===----------------------------------------------------------------------===// -// Evergreen Only instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEG] in { - -def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; -defm DIV_eg : DIV_Common; - -def MULLO_INT_eg : MULLO_INT_Common<0x8F>; -def MULHI_INT_eg : MULHI_INT_Common<0x90>; -def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; -def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; -def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; -def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; -def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; -def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; -def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; -def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; -def : RsqPat; -def SIN_eg : SIN_Common<0x8D>; -def COS_eg : COS_Common<0x8E>; - -def : POW_Common ; -def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; - -defm : Expand24IBitOps; - -//===----------------------------------------------------------------------===// -// Memory read/write instructions -//===----------------------------------------------------------------------===// - -let usesCustomInserter = 1 in { - -// 32-bit store -def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, - (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr, $index_gpr, $eop", - [(global_store i32:$rw_gpr, i32:$index_gpr)] ->; - -// 64-bit store -def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, - (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", - [(global_store v2i32:$rw_gpr, i32:$index_gpr)] ->; - -//128-bit store -def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, - (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", - [(global_store v4i32:$rw_gpr, i32:$index_gpr)] ->; - -} // End usesCustomInserter = 1 - -class VTX_READ_eg buffer_id, dag outs, list pattern> - : VTX_WORD0_eg, VTX_READ { - - // Static fields - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let BUFFER_ID = buffer_id; - let 
SRC_REL = 0; - // XXX: We can infer this field based on the SRC_GPR. This would allow us - // to store vertex addresses in any channel, not just X. - let SRC_SEL_X = 0; - - let Inst{31-0} = Word0; -} - -class VTX_READ_8_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 1; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 1; // FMT_8 -} - -class VTX_READ_16_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - let MEGA_FETCH_COUNT = 2; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 5; // FMT_16 - -} - -class VTX_READ_32_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id, - (outs R600_TReg32_X:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 4; - let DST_SEL_X = 0; - let DST_SEL_Y = 7; // Masked - let DST_SEL_Z = 7; // Masked - let DST_SEL_W = 7; // Masked - let DATA_FORMAT = 0xD; // COLOR_32 - - // This is not really necessary, but there were some GPU hangs that appeared - // to be caused by ALU instructions in the next instruction group that wrote - // to the $src_gpr registers of the VTX_READ. - // e.g. - // %T3_X = VTX_READ_PARAM_32_eg %T2_X, 24 - // %T2_X = MOV %ZERO - //Adding this constraint prevents this from happening. - let Constraints = "$src_gpr.ptr = $dst_gpr"; -} - -class VTX_READ_64_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id, - (outs R600_Reg64:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 8; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 7; - let DST_SEL_W = 7; - let DATA_FORMAT = 0x1D; // COLOR_32_32 -} - -class VTX_READ_128_eg buffer_id, list pattern> - : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id, - (outs R600_Reg128:$dst_gpr), pattern> { - - let MEGA_FETCH_COUNT = 16; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 - - // XXX: Need to force VTX_READ_128 instructions to write to the same register - // that holds its buffer address to avoid potential hangs. We can't use - // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst - // registers are different sizes. 
-} - -//===----------------------------------------------------------------------===// -// VTX Read from parameter memory space -//===----------------------------------------------------------------------===// - -def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, - [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, - [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, - [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, - [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, - [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] ->; - -//===----------------------------------------------------------------------===// -// VTX Read from global memory space -//===----------------------------------------------------------------------===// - -// 8-bit reads -def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] ->; - -def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] ->; - -// 32-bit reads -def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 64-bit reads -def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -// 128-bit reads -def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] ->; - -} // End Predicates = [isEG] - -//===----------------------------------------------------------------------===// -// Evergreen / Cayman Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isEGorCayman] in { - -// Should be predicated on FeatureFP64 -// def FMA_64 : R600_3OP < -// 0xA, "FMA_64", -// [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))] -// >; - -// BFE_UINT - bit_extract, an optimization for mask and shift -// Src0 = Input -// Src1 = Offset -// Src2 = Width -// -// bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) -// -// Example Usage: -// (Offset, Width) -// -// (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 -// (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 -// (16, 8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 -// (24, 8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 -def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", - [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def BFE_INT_eg : R600_3OP <0x5, "BFE_INT", - [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def : BFEPattern ; - -def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", - [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))], - VecALU ->; - -def : Pat<(i32 (sext_inreg i32:$src, i1)), - (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; -def : Pat<(i32 (sext_inreg i32:$src, i8)), - (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; -def : Pat<(i32 (sext_inreg i32:$src, i16)), - (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; - -defm : BFIPatterns ; - -def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))], - VecALU ->; - -def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU ->; - -def : UMad24Pat; - -def 
BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; -def : ROTRPattern ; -def MULADD_eg : MULADD_Common<0x14>; -def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; -def FMA_eg : FMA_Common<0x7>; -def ASHR_eg : ASHR_Common<0x15>; -def LSHR_eg : LSHR_Common<0x16>; -def LSHL_eg : LSHL_Common<0x17>; -def CNDE_eg : CNDE_Common<0x19>; -def CNDGT_eg : CNDGT_Common<0x1A>; -def CNDGE_eg : CNDGE_Common<0x1B>; -def MUL_LIT_eg : MUL_LIT_Common<0x1F>; -def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; -def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU ->; -def DOT4_eg : DOT4_Common<0xBE>; -defm CUBE_eg : CUBE_Common<0xC0>; - -def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; - -def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; -def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; - -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; -def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; - -let hasSideEffects = 1 in { - def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; -} - -def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common; - -def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { - let Pattern = []; - let Itinerary = AnyALU; -} - -def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; - -def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { - let Pattern = []; -} - -def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; - -def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, - R600ALU_Word0, - R600ALU_Word1_OP2 <0x54> { - - let dst = 0; - let dst_rel = 0; - let src0 = 0; - let src0_rel = 0; - let src0_neg = 0; - let src0_abs = 0; - let src1 = 0; - let src1_rel = 0; - let src1_neg = 0; - let src1_abs = 0; - let write = 0; - let omod = 0; - let clamp = 0; - let last = 1; - let bank_swizzle = 0; - let pred_sel = 0; - let update_exec_mask = 0; - let update_pred = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let ALUInst = 1; -} - -def : Pat < - (int_AMDGPU_barrier_global), - (GROUP_BARRIER) ->; - -//===----------------------------------------------------------------------===// -// LDS Instructions -//===----------------------------------------------------------------------===// -class R600_LDS op, dag outs, dag ins, string asm, - list pattern = []> : - - InstR600 , - R600_ALU_LDS_Word0, - R600LDS_Word1 { - - bits<6> offset = 0; - let lds_op = op; - - let Word1{27} = offset{0}; - let Word1{12} = offset{1}; - let Word1{28} = offset{2}; - let Word1{31} = offset{3}; - let Word0{12} = offset{4}; - let Word0{25} = offset{5}; - - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let ALUInst = 1; - let HasNativeOperands = 1; - let UseNamedOperandTable = 1; -} - -class R600_LDS_1A lds_op, string name, list pattern> : R600_LDS < - lds_op, - (outs R600_Reg32:$dst), - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, - BANK_SWIZZLE:$bank_swizzle), - " "#name#" $last OQAP, $src0$src0_rel $pred_sel", - pattern - > { - - let src1 = 0; - let src1_rel = 0; - let src2 = 0; - let src2_rel = 0; - - let usesCustomInserter = 1; - let LDS_1A = 1; - let DisableEncoding = "$dst"; -} - -class R600_LDS_1A1D lds_op, dag outs, string name, list pattern, - string dst =""> : - R600_LDS < - lds_op, outs, - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, - BANK_SWIZZLE:$bank_swizzle), 
- " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", - pattern - > { - - field string BaseOp; - - let src2 = 0; - let src2_rel = 0; - let LDS_1A1D = 1; -} - -class R600_LDS_1A1D_NORET lds_op, string name, list pattern> : - R600_LDS_1A1D { - let BaseOp = name; -} - -class R600_LDS_1A1D_RET lds_op, string name, list pattern> : - R600_LDS_1A1D { - - let BaseOp = name; - let usesCustomInserter = 1; - let DisableEncoding = "$dst"; -} - -class R600_LDS_1A2D lds_op, dag outs, string name, list pattern, - string dst =""> : - R600_LDS < - lds_op, outs, - (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, - R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle), - " "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel", - pattern> { - - field string BaseOp; - - let LDS_1A1D = 0; - let LDS_1A2D = 1; -} - -class R600_LDS_1A2D_NORET lds_op, string name, list pattern> : - R600_LDS_1A2D { - let BaseOp = name; -} - -class R600_LDS_1A2D_RET lds_op, string name, list pattern> : - R600_LDS_1A2D { - - let BaseOp = name; - let usesCustomInserter = 1; - let DisableEncoding = "$dst"; -} - -def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; -def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >; -def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >; -def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >; -def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >; -def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >; -def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >; -def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >; -def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; -def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; -def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; -def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", - [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] ->; -def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", - [(truncstorei8_local i32:$src1, i32:$src0)] ->; -def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", - [(truncstorei16_local i32:$src1, i32:$src0)] ->; -def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", - [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] ->; -def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB", - [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))] ->; -def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND", - [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))] ->; -def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR", - [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))] ->; -def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR", - [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))] ->; -def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT", - [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))] ->; -def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT", - [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))] ->; -def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT", - [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))] ->; -def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT", - [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))] ->; -def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", - [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] ->; -def 
LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))] ->; -def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", - [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] ->; -def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", - [(set i32:$dst, (sextloadi8_local i32:$src0))] ->; -def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET", - [(set i32:$dst, (az_extloadi8_local i32:$src0))] ->; -def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET", - [(set i32:$dst, (sextloadi16_local i32:$src0))] ->; -def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", - [(set i32:$dst, (az_extloadi16_local i32:$src0))] ->; - -// TRUNC is used for the FLT_TO_INT instructions to work around a -// perceived problem where the rounding modes are applied differently -// depending on the instruction and the slot they are in. -// See: -// https://bugs.freedesktop.org/show_bug.cgi?id=50232 -// Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c -// -// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, -// which do not need to be truncated since the fp values are 0.0f or 1.0f. -// We should look into handling these cases separately. -def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; - -def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; - -// SHA-256 Patterns -def : SHA256MaPattern ; - -def EG_ExportSwz : ExportSwzInst { - let Word1{19-16} = 0; // BURST_COUNT - let Word1{20} = 0; // VALID_PIXEL_MODE - let Word1{21} = eop; - let Word1{29-22} = inst; - let Word1{30} = 0; // MARK - let Word1{31} = 1; // BARRIER -} -defm : ExportPattern; - -def EG_ExportBuf : ExportBufInst { - let Word1{19-16} = 0; // BURST_COUNT - let Word1{20} = 0; // VALID_PIXEL_MODE - let Word1{21} = eop; - let Word1{29-22} = inst; - let Word1{30} = 0; // MARK - let Word1{31} = 1; // BARRIER -} -defm : SteamOutputExportPattern; - -def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), - "TEX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} -def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), - "VTX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} -def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), - "LOOP_START_DX10 @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), - "LOOP_BREAK @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), - "CONTINUE @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} -def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "JUMP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "PUSH @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "ELSE @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { - let ADDR = 0; - let COUNT = 0; - let POP_COUNT = 0; -} -def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "POP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} -def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { - let COUNT = 0; - let POP_COUNT = 0; - let ADDR = 0; - let END_OF_PROGRAM = 1; -} - -} // End Predicates = [isEGorCayman] diff --git 
a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp deleted file mode 100644 index e811d5cff22..00000000000 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ /dev/null @@ -1,642 +0,0 @@ -//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUInstPrinter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/MathExtras.h" - -using namespace llvm; - -void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, - StringRef Annot, const MCSubtargetInfo &STI) { - OS.flush(); - printInstruction(MI, OS); - - printAnnotation(OS, Annot); -} - -void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); -} - -void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); -} - -void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); -} - -void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " offen"; -} - -void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " idxen"; -} - -void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " addr64"; -} - -void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - uint16_t Imm = MI->getOperand(OpNo).getImm(); - if (Imm != 0) { - O << " offset:"; - printU16ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset0:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " offset1:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " gds"; -} - -void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " glc"; -} - -void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if 
(MI->getOperand(OpNo).getImm()) - O << " slc"; -} - -void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " tfe"; -} - -void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, - const MCRegisterInfo &MRI) { - switch (reg) { - case AMDGPU::VCC: - O << "vcc"; - return; - case AMDGPU::SCC: - O << "scc"; - return; - case AMDGPU::EXEC: - O << "exec"; - return; - case AMDGPU::M0: - O << "m0"; - return; - case AMDGPU::FLAT_SCR: - O << "flat_scratch"; - return; - case AMDGPU::VCC_LO: - O << "vcc_lo"; - return; - case AMDGPU::VCC_HI: - O << "vcc_hi"; - return; - case AMDGPU::EXEC_LO: - O << "exec_lo"; - return; - case AMDGPU::EXEC_HI: - O << "exec_hi"; - return; - case AMDGPU::FLAT_SCR_LO: - O << "flat_scratch_lo"; - return; - case AMDGPU::FLAT_SCR_HI: - O << "flat_scratch_hi"; - return; - default: - break; - } - - char Type; - unsigned NumRegs; - - if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 1; - } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 3; - } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { - Type = 'v'; - NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { - Type = 's'; - NumRegs = 16; - } else { - O << getRegisterName(reg); - return; - } - - // The low 8 bits of the encoding value is the register index, for both VGPRs - // and SGPRs. 
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); - if (NumRegs == 1) { - O << Type << RegIdx; - return; - } - - O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; -} - -void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) - O << "_e64 "; - else - O << "_e32 "; - - printOperand(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, raw_ostream &O) { - int32_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { - O << SImm; - return; - } - - if (Imm == FloatToBits(0.0f)) - O << "0.0"; - else if (Imm == FloatToBits(1.0f)) - O << "1.0"; - else if (Imm == FloatToBits(-1.0f)) - O << "-1.0"; - else if (Imm == FloatToBits(0.5f)) - O << "0.5"; - else if (Imm == FloatToBits(-0.5f)) - O << "-0.5"; - else if (Imm == FloatToBits(2.0f)) - O << "2.0"; - else if (Imm == FloatToBits(-2.0f)) - O << "-2.0"; - else if (Imm == FloatToBits(4.0f)) - O << "4.0"; - else if (Imm == FloatToBits(-4.0f)) - O << "-4.0"; - else - O << formatHex(static_cast(Imm)); -} - -void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { - int64_t SImm = static_cast(Imm); - if (SImm >= -16 && SImm <= 64) { - O << SImm; - return; - } - - if (Imm == DoubleToBits(0.0)) - O << "0.0"; - else if (Imm == DoubleToBits(1.0)) - O << "1.0"; - else if (Imm == DoubleToBits(-1.0)) - O << "-1.0"; - else if (Imm == DoubleToBits(0.5)) - O << "0.5"; - else if (Imm == DoubleToBits(-0.5)) - O << "-0.5"; - else if (Imm == DoubleToBits(2.0)) - O << "2.0"; - else if (Imm == DoubleToBits(-2.0)) - O << "-2.0"; - else if (Imm == DoubleToBits(4.0)) - O << "4.0"; - else if (Imm == DoubleToBits(-4.0)) - O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); -} - -void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - switch (Op.getReg()) { - // This is the default predicate state, so we don't need to print it. - case AMDGPU::PRED_SEL_OFF: - break; - - default: - printRegOperand(Op.getReg(), O, MRI); - break; - } - } else if (Op.isImm()) { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int RCID = Desc.OpInfo[OpNo].RegClass; - if (RCID != -1) { - const MCRegisterClass &ImmRC = MRI.getRegClass(RCID); - if (ImmRC.getSize() == 4) - printImmediate32(Op.getImm(), O); - else if (ImmRC.getSize() == 8) - printImmediate64(Op.getImm(), O); - else - llvm_unreachable("Invalid register class size"); - } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { - printImmediate32(Op.getImm(), O); - } else { - // We hit this for the immediate instruction bits that don't yet have a - // custom printer. - // TODO: Eventually this should be unnecessary. - O << formatDec(Op.getImm()); - } - } else if (Op.isFPImm()) { - // We special case 0.0 because otherwise it will be printed as an integer. 
- if (Op.getFPImm() == 0.0) - O << "0.0"; - else { - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - const MCRegisterClass &ImmRC = MRI.getRegClass(Desc.OpInfo[OpNo].RegClass); - - if (ImmRC.getSize() == 4) - printImmediate32(FloatToBits(Op.getFPImm()), O); - else if (ImmRC.getSize() == 8) - printImmediate64(DoubleToBits(Op.getFPImm()), O); - else - llvm_unreachable("Invalid register class size"); - } - } else if (Op.isExpr()) { - const MCExpr *Exp = Op.getExpr(); - Exp->print(O, &MAI); - } else { - llvm_unreachable("unknown operand type in printOperand"); - } -} - -void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned InputModifiers = MI->getOperand(OpNo).getImm(); - if (InputModifiers & SISrcMods::NEG) - O << '-'; - if (InputModifiers & SISrcMods::ABS) - O << '|'; - printOperand(MI, OpNo + 1, O); - if (InputModifiers & SISrcMods::ABS) - O << '|'; -} - -void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Imm = MI->getOperand(OpNum).getImm(); - - if (Imm == 2) { - O << "P0"; - } else if (Imm == 1) { - O << "P20"; - } else if (Imm == 0) { - O << "P10"; - } else { - llvm_unreachable("Invalid interpolation parameter slot"); - } -} - -void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printOperand(MI, OpNo, O); - O << ", "; - printOperand(MI, OpNo + 1, O); -} - -void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, StringRef Asm, - StringRef Default) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm()); - if (Op.getImm() == 1) { - O << Asm; - } else { - O << Default; - } -} - -void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "|"); -} - -void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "_SAT"); -} - -void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " clamp"; -} - -void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int Imm = MI->getOperand(OpNo).getImm(); - if (Imm == SIOutMods::MUL2) - O << " mul:2"; - else if (Imm == SIOutMods::MUL4) - O << " mul:4"; - else if (Imm == SIOutMods::DIV2) - O << " div:2"; -} - -void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int32_t Imm = MI->getOperand(OpNo).getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; -} - -void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "*", " "); -} - -void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "-"); -} - -void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - switch (MI->getOperand(OpNo).getImm()) { - default: break; - case 1: - O << " * 2.0"; - break; - case 2: - O << " * 4.0"; - break; - case 3: - O << " / 2.0"; - break; - } -} - -void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "+"); -} - -void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "ExecMask,"); -} - -void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printIfSet(MI, OpNo, O, "Pred,"); -} - -void AMDGPUInstPrinter::printWrite(const MCInst 
*MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() == 0) { - O << " (MASKED)"; - } -} - -void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const char * chans = "XYZW"; - int sel = MI->getOperand(OpNo).getImm(); - - int chan = sel & 3; - sel >>= 2; - - if (sel >= 512) { - sel -= 512; - int cb = sel >> 12; - sel &= 4095; - O << cb << '[' << sel << ']'; - } else if (sel >= 448) { - sel -= 448; - O << sel; - } else if (sel >= 0){ - O << sel; - } - - if (sel >= 0) - O << '.' << chans[chan]; -} - -void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int BankSwizzle = MI->getOperand(OpNo).getImm(); - switch (BankSwizzle) { - case 1: - O << "BS:VEC_021/SCL_122"; - break; - case 2: - O << "BS:VEC_120/SCL_212"; - break; - case 3: - O << "BS:VEC_102/SCL_221"; - break; - case 4: - O << "BS:VEC_201"; - break; - case 5: - O << "BS:VEC_210"; - break; - default: - break; - } - return; -} - -void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Sel = MI->getOperand(OpNo).getImm(); - switch (Sel) { - case 0: - O << 'X'; - break; - case 1: - O << 'Y'; - break; - case 2: - O << 'Z'; - break; - case 3: - O << 'W'; - break; - case 4: - O << '0'; - break; - case 5: - O << '1'; - break; - case 7: - O << '_'; - break; - default: - break; - } -} - -void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned CT = MI->getOperand(OpNo).getImm(); - switch (CT) { - case 0: - O << 'U'; - break; - case 1: - O << 'N'; - break; - default: - break; - } -} - -void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - int KCacheMode = MI->getOperand(OpNo).getImm(); - if (KCacheMode > 0) { - int KCacheBank = MI->getOperand(OpNo - 2).getImm(); - O << "CB" << KCacheBank << ':'; - int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); - int LineSize = (KCacheMode == 1) ? 16 : 32; - O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; - } -} - -void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Msg = SImm16 & 0xF; - if (Msg == 2 || Msg == 3) { - unsigned Op = (SImm16 >> 4) & 0xF; - if (Msg == 3) - O << "Gs_done("; - else - O << "Gs("; - if (Op == 0) { - O << "nop"; - } else { - unsigned Stream = (SImm16 >> 8) & 0x3; - if (Op == 1) - O << "cut"; - else if (Op == 2) - O << "emit"; - else if (Op == 3) - O << "emit-cut"; - O << " stream " << Stream; - } - O << "), [m0] "; - } else if (Msg == 1) - O << "interrupt "; - else if (Msg == 15) - O << "system "; - else - O << "unknown(" << Msg << ") "; -} - -void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs - // SIInsertWaits.cpp bits usage does not match ISA docs description but it - // works so it might be a misprint in docs. 
- unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Vmcnt = SImm16 & 0xF; - unsigned Expcnt = (SImm16 >> 4) & 0xF; - unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; - - bool NeedSpace = false; - - if (Vmcnt != 0xF) { - O << "vmcnt(" << Vmcnt << ')'; - NeedSpace = true; - } - - if (Expcnt != 0x7) { - if (NeedSpace) - O << ' '; - O << "expcnt(" << Expcnt << ')'; - NeedSpace = true; - } - - if (Lgkmcnt != 0x7) { - if (NeedSpace) - O << ' '; - O << "lgkmcnt(" << Lgkmcnt << ')'; - } -} - -#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h deleted file mode 100644 index 14fb511e923..00000000000 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ /dev/null @@ -1,88 +0,0 @@ -//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H - -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -class AMDGPUInstPrinter : public MCInstPrinter { -public: - AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} - - //Autogenerated by tblgen - void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getRegisterName(unsigned RegNo); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - static void printRegOperand(unsigned RegNo, raw_ostream &O, - const MCRegisterInfo &MRI); - -private: - void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printRegOperand(unsigned RegNo, raw_ostream &O); - void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printImmediate32(uint32_t I, raw_ostream &O); - void printImmediate64(uint64_t I, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void 
printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, - StringRef Asm, StringRef Default = ""); - static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printUpdateExecMask(const MCInst *MI, unsigned OpNo, - raw_ostream &O); - static void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/InstPrinter/CMakeLists.txt b/lib/Target/R600/InstPrinter/CMakeLists.txt deleted file mode 100644 index dcd87037fab..00000000000 --- a/lib/Target/R600/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMR600AsmPrinter - AMDGPUInstPrinter.cpp - ) diff --git a/lib/Target/R600/InstPrinter/LLVMBuild.txt b/lib/Target/R600/InstPrinter/LLVMBuild.txt deleted file mode 100644 index ec0be89f104..00000000000 --- a/lib/Target/R600/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600AsmPrinter -parent = R600 -required_libraries = MC Support -add_to_library_groups = R600 - diff --git a/lib/Target/R600/InstPrinter/Makefile b/lib/Target/R600/InstPrinter/Makefile deleted file mode 100644 index a794cc1124e..00000000000 --- a/lib/Target/R600/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. 
-# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600AsmPrinter - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt deleted file mode 100644 index f3f254fdcba..00000000000 --- a/lib/Target/R600/LLVMBuild.txt +++ /dev/null @@ -1,33 +0,0 @@ -;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo - -[component_0] -type = TargetGroup -name = R600 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 - -[component_1] -type = Library -name = R600CodeGen -parent = R600 -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC R600AsmParser R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils -add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp deleted file mode 100644 index 8bed2deef4c..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ /dev/null @@ -1,144 +0,0 @@ -//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -namespace { - -class AMDGPUMCObjectWriter : public MCObjectWriter { -public: - AMDGPUMCObjectWriter(raw_pwrite_stream &OS) : MCObjectWriter(OS, true) {} - void executePostLayoutBinding(MCAssembler &Asm, - const MCAsmLayout &Layout) override { - //XXX: Implement if necessary. 
- } - void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, - MCValue Target, bool &IsPCRel, - uint64_t &FixedValue) override { - assert(!"Not implemented"); - } - - void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override; - -}; - -class AMDGPUAsmBackend : public MCAsmBackend { -public: - AMDGPUAsmBackend(const Target &T) - : MCAsmBackend() {} - - unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override { - return false; - } - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { - assert(!"Not implemented"); - } - bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; -}; - -} //End anonymous namespace - -void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, - const MCAsmLayout &Layout) { - for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { - Asm.writeSectionData(&*I, Layout); - } -} - -void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - - switch ((unsigned)Fixup.getKind()) { - default: llvm_unreachable("Unknown fixup kind"); - case AMDGPU::fixup_si_sopp_br: { - uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - *Dst = (Value - 4) / 4; - break; - } - - case AMDGPU::fixup_si_rodata: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. 
- *Dst = Value + 4; - break; - } - } -} - -const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( - MCFixupKind Kind) const { - const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { - // name offset bits flags - { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - return Infos[Kind - FirstTargetFixupKind]; -} - -bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - OW->WriteZeros(Count); - - return true; -} - -//===----------------------------------------------------------------------===// -// ELFAMDGPUAsmBackend class -//===----------------------------------------------------------------------===// - -namespace { - -class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { -public: - ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } - - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(OS); - } -}; - -} // end anonymous namespace - -MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU) { - return new ELFAMDGPUAsmBackend(T); -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp deleted file mode 100644 index 59f45ff02d8..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ /dev/null @@ -1,39 +0,0 @@ -//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCTargetDesc.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCFixup.h" - -using namespace llvm; - -namespace { - -class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { -public: - AMDGPUELFObjectWriter(); -protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override { - return Fixup.getKind(); - } - -}; - - -} // End anonymous namespace - -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() - : MCELFObjectTargetWriter(false, 0, 0, false) { } - -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_pwrite_stream &OS) { - MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); - return createELFObjectWriter(MOTW, OS, true); -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h deleted file mode 100644 index 01021d67ffd..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h +++ /dev/null @@ -1,34 +0,0 @@ -//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace AMDGPU { -enum Fixups { - /// 16-bit PC relative fixup for SOPP branch instructions. 
- fixup_si_sopp_br = FirstTargetFixupKind, - - /// fixup for global addresses with constant initializers - fixup_si_rodata, - - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; -} -} - -#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp deleted file mode 100644 index 028a86dfc7a..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ /dev/null @@ -1,43 +0,0 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCAsmInfo.h" - -using namespace llvm; -AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { - HasSingleParameterDotFile = false; - //===------------------------------------------------------------------===// - MaxInstLength = 16; - SeparatorString = "\n"; - CommentString = ";"; - PrivateLabelPrefix = ""; - InlineAsmStart = ";#ASMSTART"; - InlineAsmEnd = ";#ASMEND"; - - //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; - SunStyleELFSectionSwitchSyntax = true; - UsesELFSectionDirectiveForBSS = true; - - //===--- Global Variable Emission Directives --------------------------===// - HasAggressiveSymbolFolding = true; - COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; - HasNoDeadStrip = true; - WeakRefDirective = ".weakref\t"; - //===--- Dwarf Emission Directives -----------------------------------===// - SupportsDebugInformation = true; -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h deleted file mode 100644 index a5bac51e356..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h +++ /dev/null @@ -1,32 +0,0 @@ -//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H - -#include "llvm/MC/MCAsmInfoELF.h" -namespace llvm { - -class Triple; - -// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, -// you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value -// for PrivateGlobalPrefix is 'L', so it will consider any function starting -// with 'L' as a local symbol. 
-class AMDGPUMCAsmInfo : public MCAsmInfoELF { -public: - explicit AMDGPUMCAsmInfo(const Triple &TT); -}; -} // namespace llvm -#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp deleted file mode 100644 index 521b3b39bba..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCCodeEmitter.h" - -using namespace llvm; - -// pin vtable to this file -void AMDGPUMCCodeEmitter::anchor() {} - diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h deleted file mode 100644 index c9574276223..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ /dev/null @@ -1,50 +0,0 @@ -//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H - -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -class MCInst; -class MCOperand; -class MCSubtargetInfo; - -class AMDGPUMCCodeEmitter : public MCCodeEmitter { - virtual void anchor(); -public: - - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } - - virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - return 0; - } -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp deleted file mode 100644 index 02192c40f92..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ /dev/null @@ -1,90 +0,0 @@ -//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This file provides AMDGPU specific target descriptions. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUMCTargetDesc.h" -#include "AMDGPUMCAsmInfo.h" -#include "InstPrinter/AMDGPUInstPrinter.h" -#include "SIDefines.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MachineLocation.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_MC_DESC -#include "AMDGPUGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "AMDGPUGenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "AMDGPUGenRegisterInfo.inc" - -static MCInstrInfo *createAMDGPUMCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitAMDGPUMCInstrInfo(X); - return X; -} - -static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitAMDGPUMCRegisterInfo(X, 0); - return X; -} - -static MCSubtargetInfo * -createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - MCSubtargetInfo * X = new MCSubtargetInfo(); - InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { - return new AMDGPUInstPrinter(MAI, MII, MRI); -} - -extern "C" void LLVMInitializeR600TargetMC() { - for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { - RegisterMCAsmInfo X(*T); - - TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); - TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); - TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); - TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); - TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); - TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); - } - - TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, - createR600MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheGCNTarget, createSIMCCodeEmitter); -} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h deleted file mode 100644 index 92e29dc7037..00000000000 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ /dev/null @@ -1,61 +0,0 @@ -//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Provides AMDGPU specific target descriptions. 
-// -//===----------------------------------------------------------------------===// -// - -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include "llvm/ADT/StringRef.h" - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCObjectWriter; -class MCRegisterInfo; -class MCSubtargetInfo; -class Target; -class Triple; -class raw_pwrite_stream; -class raw_ostream; - -extern Target TheAMDGPUTarget; -extern Target TheGCNTarget; - -MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx); - -MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx); - -MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); - -MCObjectWriter *createAMDGPUELFObjectWriter(raw_pwrite_stream &OS); -} // End llvm namespace - -#define GET_REGINFO_ENUM -#include "AMDGPUGenRegisterInfo.inc" - -#define GET_INSTRINFO_ENUM -#include "AMDGPUGenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "AMDGPUGenSubtargetInfo.inc" - -#endif diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index 801c9054937..00000000000 --- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ - -add_llvm_library(LLVMR600Desc - AMDGPUAsmBackend.cpp - AMDGPUELFObjectWriter.cpp - AMDGPUMCCodeEmitter.cpp - AMDGPUMCTargetDesc.cpp - AMDGPUMCAsmInfo.cpp - R600MCCodeEmitter.cpp - SIMCCodeEmitter.cpp - ) diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index 74b8ca09ae1..00000000000 --- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600Desc -parent = R600 -required_libraries = MC R600AsmPrinter R600Info Support -add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/Makefile b/lib/Target/R600/MCTargetDesc/Makefile deleted file mode 100644 index 8894a7607f4..00000000000 --- a/lib/Target/R600/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp deleted file mode 100644 index e683498d52a..00000000000 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ /dev/null @@ -1,181 +0,0 @@ -//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// \brief The R600 code emitter produces machine code that can be executed -/// directly on the GPU device. -// -//===----------------------------------------------------------------------===// - -#include "R600Defines.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/EndianStream.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; - void operator=(const R600MCCodeEmitter &) = delete; - const MCInstrInfo &MCII; - const MCRegisterInfo &MRI; - -public: - - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : MCII(mcii), MRI(mri) { } - - /// \brief Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \returns the encoding for an MCOperand. 
- uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; -private: - - void EmitByte(unsigned int byte, raw_ostream &OS) const; - - void Emit(uint32_t value, raw_ostream &OS) const; - void Emit(uint64_t value, raw_ostream &OS) const; - - unsigned getHWRegChan(unsigned reg) const; - unsigned getHWReg(unsigned regNo) const; - -}; - -} // End anonymous namespace - -enum RegElement { - ELEMENT_X = 0, - ELEMENT_Y, - ELEMENT_Z, - ELEMENT_W -}; - -enum FCInstr { - FC_IF_PREDICATE = 0, - FC_ELSE, - FC_ENDIF, - FC_BGNLOOP, - FC_ENDLOOP, - FC_BREAK_PREDICATE, - FC_CONTINUE -}; - -MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI); -} - -void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::FETCH_CLAUSE || - MI.getOpcode() == AMDGPU::ALU_CLAUSE || - MI.getOpcode() == AMDGPU::BUNDLE || - MI.getOpcode() == AMDGPU::KILL) { - return; - } else if (IS_VTX(Desc)) { - uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); - uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { - InstWord2 |= 1 << 19; // Mega-Fetch bit - } - - Emit(InstWord01, OS); - Emit(InstWord2, OS); - Emit((uint32_t) 0, OS); - } else if (IS_TEX(Desc)) { - int64_t Sampler = MI.getOperand(14).getImm(); - - int64_t SrcSelect[4] = { - MI.getOperand(2).getImm(), - MI.getOperand(3).getImm(), - MI.getOperand(4).getImm(), - MI.getOperand(5).getImm() - }; - int64_t Offsets[3] = { - MI.getOperand(6).getImm() & 0x1F, - MI.getOperand(7).getImm() & 0x1F, - MI.getOperand(8).getImm() & 0x1F - }; - - uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups, STI); - uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | - SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | - SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | - Offsets[2] << 10; - - Emit(Word01, OS); - Emit(Word2, OS); - Emit((uint32_t) 0, OS); - } else { - uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && - ((Desc.TSFlags & R600_InstFlag::OP1) || - Desc.TSFlags & R600_InstFlag::OP2)) { - uint64_t ISAOpCode = Inst & (0x3FFULL << 39); - Inst &= ~(0x3FFULL << 39); - Inst |= ISAOpCode << 1; - } - Emit(Inst, OS); - } -} - -void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { - OS.write((uint8_t) Byte & 0xff); -} - -void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { - support::endian::Writer(OS).write(Value); -} - -void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { - support::endian::Writer(OS).write(Value); -} - -unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { - return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; -} - -unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { - return MRI.getEncodingValue(RegNo) & HW_REG_MASK; -} - -uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl &Fixup, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) { - if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) - return MRI.getEncodingValue(MO.getReg()); - return getHWReg(MO.getReg()); - } - - assert(MO.isImm()); 
- return MO.getImm(); -} - -#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp deleted file mode 100644 index 65a0eeba2b1..00000000000 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ /dev/null @@ -1,289 +0,0 @@ -//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief The SI code emitter produces machine code that can be executed -/// directly on the GPU device. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; - void operator=(const SIMCCodeEmitter &) = delete; - const MCInstrInfo &MCII; - const MCRegisterInfo &MRI; - MCContext &Ctx; - - /// \brief Can this operand also contain immediate values? - bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; - - /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize) const; - -public: - SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } - - ~SIMCCodeEmitter() override {} - - /// \brief Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \returns the encoding for an MCOperand. - uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - /// \brief Use a fixup to encode the simm16 field for SOPP branch - /// instructions. - unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; -}; - -} // End anonymous namespace - -MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - MCContext &Ctx) { - return new SIMCCodeEmitter(MCII, MRI, Ctx); -} - -bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, - unsigned OpNo) const { - unsigned OpType = Desc.OpInfo[OpNo].OperandType; - - return OpType == AMDGPU::OPERAND_REG_IMM32 || - OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - -// Returns the encoding value to use if the given integer is an integer inline -// immediate value, or 0 if it is not. 
-template -static uint32_t getIntInlineImmEncoding(IntTy Imm) { - if (Imm >= 0 && Imm <= 64) - return 128 + Imm; - - if (Imm >= -16 && Imm <= -1) - return 192 + std::abs(Imm); - - return 0; -} - -static uint32_t getLit32Encoding(uint32_t Val) { - uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == FloatToBits(0.5f)) - return 240; - - if (Val == FloatToBits(-0.5f)) - return 241; - - if (Val == FloatToBits(1.0f)) - return 242; - - if (Val == FloatToBits(-1.0f)) - return 243; - - if (Val == FloatToBits(2.0f)) - return 244; - - if (Val == FloatToBits(-2.0f)) - return 245; - - if (Val == FloatToBits(4.0f)) - return 246; - - if (Val == FloatToBits(-4.0f)) - return 247; - - return 255; -} - -static uint32_t getLit64Encoding(uint64_t Val) { - uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == DoubleToBits(0.5)) - return 240; - - if (Val == DoubleToBits(-0.5)) - return 241; - - if (Val == DoubleToBits(1.0)) - return 242; - - if (Val == DoubleToBits(-1.0)) - return 243; - - if (Val == DoubleToBits(2.0)) - return 244; - - if (Val == DoubleToBits(-2.0)) - return 245; - - if (Val == DoubleToBits(4.0)) - return 246; - - if (Val == DoubleToBits(-4.0)) - return 247; - - return 255; -} - -uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, - unsigned OpSize) const { - if (MO.isExpr()) - return 255; - - assert(!MO.isFPImm()); - - if (!MO.isImm()) - return ~0; - - if (OpSize == 4) - return getLit32Encoding(static_cast(MO.getImm())); - - assert(OpSize == 8); - - return getLit64Encoding(static_cast(MO.getImm())); -} - -void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - - uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups, STI); - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - unsigned bytes = Desc.getSize(); - - for (unsigned i = 0; i < bytes; i++) { - OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); - } - - if (bytes > 4) - return; - - // Check for additional literals in SRC0/1/2 (Op 1/2/3) - for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { - - // Check if this operand should be encoded as [SV]Src - if (!isSrcOperand(Desc, i)) - continue; - - int RCID = Desc.OpInfo[i].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - - // Is this operand a literal immediate? - const MCOperand &Op = MI.getOperand(i); - if (getLitEncoding(Op, RC.getSize()) != 255) - continue; - - // Yes! Encode it - int64_t Imm = 0; - - if (Op.isImm()) - Imm = Op.getImm(); - else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. 
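For reference, the inline-constant rules implemented by getIntInlineImmEncoding and getLit32Encoding above are compact enough to restate on their own: integers 0 through 64 encode as 128 + N, integers -16 through -1 as 192 + |N|, eight specific float bit patterns as 240 through 247, and everything else as 255 followed by a 32-bit literal dword after the instruction. The following is only a standalone sketch of that mapping in plain C++ (floatBits stands in for llvm::FloatToBits; none of the real MC interfaces are used):

.. code-block:: cpp

  // Sketch of the SI 32-bit source-operand encoding rules described above.
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static uint32_t floatBits(float F) {            // stand-in for llvm::FloatToBits
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    return Bits;
  }

  static uint32_t lit32Encoding(uint32_t Val) {
    int32_t S = static_cast<int32_t>(Val);
    if (S >= 0 && S <= 64)
      return 128 + S;                             // inline integers 0..64
    if (S >= -16 && S <= -1)
      return 192 - S;                             // inline integers -1..-16
    const float Consts[] = {0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f};
    for (int I = 0; I < 8; ++I)
      if (Val == floatBits(Consts[I]))
        return 240 + I;                           // inline float constants
    return 255;                                   // a literal dword must follow
  }

  int main() {
    printf("%u\n", (unsigned)lit32Encoding(1));                // 129
    printf("%u\n", (unsigned)lit32Encoding(uint32_t(-1)));     // 193
    printf("%u\n", (unsigned)lit32Encoding(floatBits(-2.0f))); // 245
    printf("%u\n", (unsigned)lit32Encoding(100));              // 255
  }

The 64-bit path in getLit64Encoding follows the same pattern, only comparing against DoubleToBits of the same eight constants.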
- llvm_unreachable("Must be immediate or expr"); - - for (unsigned j = 0; j < 4; j++) { - OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); - } - - // Only one literal value allowed - break; - } -} - -unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpNo); - - if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; - Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); - return 0; - } - - return getMachineOpValue(MI, MO, Fixups, STI); -} - -uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return MRI.getEncodingValue(MO.getReg()); - - if (MO.isExpr()) { - const MCSymbolRefExpr *Expr = cast(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. - Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } - Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); - } - - // Figure out the operand number, needed for isSrcOperand check - unsigned OpNo = 0; - for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { - if (&MO == &MI.getOperand(OpNo)) - break; - } - - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (isSrcOperand(Desc, OpNo)) { - int RCID = Desc.OpInfo[OpNo].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - - uint32_t Enc = getLitEncoding(MO, RC.getSize()); - if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) - return Enc; - - } else if (MO.isImm()) - return MO.getImm(); - - llvm_unreachable("Encoding of this operand type is not supported yet."); - return 0; -} - diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile deleted file mode 100644 index 64a7c8c045c..00000000000 --- a/lib/Target/R600/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMR600CodeGen -TARGET = AMDGPU - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ - AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ - AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ - AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ - AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc - -DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td deleted file mode 100644 index c0ffede5199..00000000000 --- a/lib/Target/R600/Processors.td +++ /dev/null @@ -1,137 +0,0 @@ -//===-- Processors.td - R600 Processor definitions ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -class Proc Features> -: Processor; - -//===----------------------------------------------------------------------===// -// R600 -//===----------------------------------------------------------------------===// -def : Proc<"", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache]>; - -def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"r630", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rs880", R600_VLIW5_Itin, - [FeatureR600, FeatureWavefrontSize16]>; - -def : Proc<"rv670", R600_VLIW5_Itin, - [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// R700 -//===----------------------------------------------------------------------===// - -def : Proc<"rv710", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv730", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv770", R600_VLIW5_Itin, - [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Evergreen -//===----------------------------------------------------------------------===// - -def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, - FeatureCFALUBug]>; - -def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, - FeatureCFALUBug]>; - -def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; - -def : Proc<"juniper", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"cypress", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureFP64, FeatureVertexCache, - FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Northern Islands -//===----------------------------------------------------------------------===// - -def : Proc<"barts", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"turks", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureCFALUBug]>; - -def : Proc<"cayman", R600_VLIW4_Itin, - [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; - -//===----------------------------------------------------------------------===// -// Southern Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] ->; - -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] ->; - -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>; - -//===----------------------------------------------------------------------===// -// Sea Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"bonaire", 
SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16] ->; - -def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32] ->; - -def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureSeaIslands, FeatureLDSBankCount16]>; - -//===----------------------------------------------------------------------===// -// Volcanic Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug] ->; - -def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureVolcanicIslands, FeatureSGPRInitBug] ->; - -def : ProcessorModel<"carrizo", SIQuarterSpeedModel, [FeatureVolcanicIslands]>; diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp deleted file mode 100644 index 3cb90218a7d..00000000000 --- a/lib/Target/R600/R600ClauseMergePass.cpp +++ /dev/null @@ -1,206 +0,0 @@ -//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer. -/// This pass is merging consecutive CFAlus where applicable. -/// It needs to be called after IfCvt for best results. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "r600mergeclause" - -namespace { - -static bool isCFAlu(const MachineInstr *MI) { - switch (MI->getOpcode()) { - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: - return true; - default: - return false; - } -} - -class R600ClauseMergePass : public MachineFunctionPass { - -private: - static char ID; - const R600InstrInfo *TII; - - unsigned getCFAluSize(const MachineInstr *MI) const; - bool isCFAluEnabled(const MachineInstr *MI) const; - - /// IfCvt pass can generate "disabled" ALU clause marker that need to be - /// removed and their content affected to the previous alu clause. - /// This function parse instructions after CFAlu until it find a disabled - /// CFAlu and merge the content, or an enabled CFAlu. - void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; - - /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if - /// it is the case. 
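The merge test implemented below is easiest to see with concrete numbers: two back-to-back CF_ALU markers with COUNT 3 and COUNT 5 collapse into one marker with COUNT 8, provided the summed count stays under getMaxAlusPerClause() and any constant-cache bank enabled by both clauses points at the same bank/line pair. A rough standalone model of that decision follows; the struct and field names are invented for the example and do not correspond to the real MachineInstr operands (the real code also refuses to merge into a CF_ALU_PUSH_BEFORE root):

.. code-block:: cpp

  // Simplified model of the CF_ALU merge test in R600ClauseMergePass.
  #include <cstdio>

  struct CFAlu {
    unsigned Count;                      // COUNT operand
    bool KCacheMode[2];                  // KCACHE_MODE0/1 (false = bank unused)
    unsigned KCacheBank[2], KCacheAddr[2];
  };

  static bool canMerge(const CFAlu &Root, const CFAlu &Later, unsigned MaxAlus) {
    if (Root.Count + Later.Count >= MaxAlus)
      return false;                      // merged clause would be too large
    for (int B = 0; B < 2; ++B)
      if (Root.KCacheMode[B] && Later.KCacheMode[B] &&
          (Root.KCacheBank[B] != Later.KCacheBank[B] ||
           Root.KCacheAddr[B] != Later.KCacheAddr[B]))
        return false;                    // both lock bank B but at different lines
    return true;
  }

  int main() {
    CFAlu Root{3, {true, false}, {0, 0}, {4, 0}};
    CFAlu Later{5, {true, false}, {0, 0}, {4, 0}};
    printf("mergeable: %d\n", canMerge(Root, Later, 120)); // 1: counts and KC0 agree
  }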
- bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) - const; - -public: - R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override; -}; - -char R600ClauseMergePass::ID = 0; - -unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { - assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); -} - -bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { - assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); -} - -void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) - const { - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); - I++; - do { - while (I!= E && !isCFAlu(I)) - I++; - if (I == E) - return; - MachineInstr *MI = I++; - if (isCFAluEnabled(MI)) - break; - CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); - MI->eraseFromParent(); - } while (I != E); -} - -bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, - const MachineInstr *LatrCFAlu) const { - assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - unsigned RootInstCount = getCFAluSize(RootCFAlu), - LaterInstCount = getCFAluSize(LatrCFAlu); - unsigned CumuledInsts = RootInstCount + LaterInstCount; - if (CumuledInsts >= TII->getMaxAlusPerClause()) { - DEBUG(dbgs() << "Excess inst counts\n"); - return false; - } - if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) - return false; - // Is KCache Bank 0 compatible ? - int Mode0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); - int KBank0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); - int KBank0LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); - if (LatrCFAlu->getOperand(Mode0Idx).getImm() && - RootCFAlu->getOperand(Mode0Idx).getImm() && - (LatrCFAlu->getOperand(KBank0Idx).getImm() != - RootCFAlu->getOperand(KBank0Idx).getImm() || - LatrCFAlu->getOperand(KBank0LineIdx).getImm() != - RootCFAlu->getOperand(KBank0LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); - return false; - } - // Is KCache Bank 1 compatible ? 
- int Mode1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); - int KBank1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); - int KBank1LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); - if (LatrCFAlu->getOperand(Mode1Idx).getImm() && - RootCFAlu->getOperand(Mode1Idx).getImm() && - (LatrCFAlu->getOperand(KBank1Idx).getImm() != - RootCFAlu->getOperand(KBank1Idx).getImm() || - LatrCFAlu->getOperand(KBank1LineIdx).getImm() != - RootCFAlu->getOperand(KBank1LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); - return false; - } - if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { - RootCFAlu->getOperand(Mode0Idx).setImm( - LatrCFAlu->getOperand(Mode0Idx).getImm()); - RootCFAlu->getOperand(KBank0Idx).setImm( - LatrCFAlu->getOperand(KBank0Idx).getImm()); - RootCFAlu->getOperand(KBank0LineIdx).setImm( - LatrCFAlu->getOperand(KBank0LineIdx).getImm()); - } - if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { - RootCFAlu->getOperand(Mode1Idx).setImm( - LatrCFAlu->getOperand(Mode1Idx).getImm()); - RootCFAlu->getOperand(KBank1Idx).setImm( - LatrCFAlu->getOperand(KBank1Idx).getImm()); - RootCFAlu->getOperand(KBank1LineIdx).setImm( - LatrCFAlu->getOperand(KBank1LineIdx).getImm()); - } - RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); - RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); - return true; -} - -bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - MachineBasicBlock::iterator LatestCFAlu = E; - while (I != E) { - MachineInstr *MI = I++; - if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || - TII->mustBeLastInClause(MI->getOpcode())) - LatestCFAlu = E; - if (!isCFAlu(MI)) - continue; - cleanPotentialDisabledCFAlu(MI); - - if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { - MI->eraseFromParent(); - } else { - assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); - LatestCFAlu = MI; - } - } - } - return false; -} - -const char *R600ClauseMergePass::getPassName() const { - return "R600 Merge Clause Markers Pass"; -} - -} // end anonymous namespace - - -llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) { - return new R600ClauseMergePass(TM); -} diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp deleted file mode 100644 index c8f37f61fc1..00000000000 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ /dev/null @@ -1,679 +0,0 @@ -//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass compute turns all control flow pseudo instructions into native one -/// computing their address on the fly ; it also sets STACK_SIZE info. 
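Before the implementation, the overall shape of this pass is worth sketching: structured pseudo instructions (IF_PREDICATE_SET, ELSE, ENDIF, WHILELOOP, ENDLOOP, BREAK, CONTINUE) are rewritten into native CF instructions whose jump targets are only known after later instructions have been counted, so unresolved branches are kept on a stack and back-patched via CounterPropagateAddr once the matching join point is reached. A toy model of that back-patching, using invented stand-in types rather than the real MachineInstr plumbing:

.. code-block:: cpp

  // Toy model of forward-branch back-patching in R600ControlFlowFinalizer.
  #include <cstdio>
  #include <vector>

  struct CFInst { const char *Op; unsigned Addr; };

  int main() {
    std::vector<CFInst> Prog;
    std::vector<unsigned> IfStack;       // indices of unresolved CF_JUMPs
    unsigned CfCount = 0;

    // IF_PREDICATE_SET becomes a CF_JUMP with a placeholder target.
    Prog.push_back({"CF_JUMP", 0});
    IfStack.push_back(Prog.size() - 1);
    ++CfCount;

    // The predicated ALU clause body.
    Prog.push_back({"CF_ALU", 0});
    ++CfCount;

    // ENDIF becomes a POP; the pending CF_JUMP is then patched to the
    // address just past the POP (the join point).
    Prog.push_back({"POP", CfCount + 1});
    ++CfCount;
    Prog[IfStack.back()].Addr = CfCount;
    IfStack.pop_back();

    for (const CFInst &I : Prog)
      printf("%-8s addr=%u\n", I.Op, I.Addr);  // CF_JUMP and POP both target 3
  }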
-//===----------------------------------------------------------------------===// - -#include "llvm/Support/Debug.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "r600cf" - -namespace { - -struct CFStack { - - enum StackItem { - ENTRY = 0, - SUB_ENTRY = 1, - FIRST_NON_WQM_PUSH = 2, - FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 - }; - - const AMDGPUSubtarget *ST; - std::vector BranchStack; - std::vector LoopStack; - unsigned MaxStackSize; - unsigned CurrentEntries; - unsigned CurrentSubEntries; - - CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), - // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), - CurrentEntries(0), CurrentSubEntries(0) { } - - unsigned getLoopDepth(); - bool branchStackContains(CFStack::StackItem); - bool requiresWorkAroundForInst(unsigned Opcode); - unsigned getSubEntrySize(CFStack::StackItem Item); - void updateMaxStackSize(); - void pushBranch(unsigned Opcode, bool isWQM = false); - void pushLoop(); - void popBranch(); - void popLoop(); -}; - -unsigned CFStack::getLoopDepth() { - return LoopStack.size(); -} - -bool CFStack::branchStackContains(CFStack::StackItem Item) { - for (std::vector::const_iterator I = BranchStack.begin(), - E = BranchStack.end(); I != E; ++I) { - if (*I == Item) - return true; - } - return false; -} - -bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && - getLoopDepth() > 1) - return true; - - if (!ST->hasCFAluBug()) - return false; - - switch(Opcode) { - default: return false; - case AMDGPU::CF_ALU_PUSH_BEFORE: - case AMDGPU::CF_ALU_ELSE_AFTER: - case AMDGPU::CF_ALU_BREAK: - case AMDGPU::CF_ALU_CONTINUE: - if (CurrentSubEntries == 0) - return false; - if (ST->getWavefrontSize() == 64) { - // We are being conservative here. We only require this work-around if - // CurrentSubEntries > 3 && - // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) - // - // We have to be conservative, because we don't know for certain that - // our stack allocation algorithm for Evergreen/NI is correct. Applying this - // work-around when CurrentSubEntries > 3 allows us to over-allocate stack - // resources without any problems. - return CurrentSubEntries > 3; - } else { - assert(ST->getWavefrontSize() == 32); - // We are being conservative here. We only require the work-around if - // CurrentSubEntries > 7 && - // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) - // See the comment on the wavefront size == 64 case for why we are - // being conservative. - return CurrentSubEntries > 7; - } - } -} - -unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { - switch(Item) { - default: - return 0; - case CFStack::FIRST_NON_WQM_PUSH: - assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= AMDGPUSubtarget::R700) { - // +1 For the push operation. - // +2 Extra space required. - return 3; - } else { - // Some documentation says that this is not necessary on Evergreen, - // but experimentation has show that we need to allocate 1 extra - // sub-entry for the first non-WQM push. - // +1 For the push operation. - // +1 Extra space required. 
- return 2; - } - case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); - // +1 For the push operation. - // +1 Extra space required. - return 2; - case CFStack::SUB_ENTRY: - return 1; - } -} - -void CFStack::updateMaxStackSize() { - unsigned CurrentStackSize = CurrentEntries + - (RoundUpToAlignment(CurrentSubEntries, 4) / 4); - MaxStackSize = std::max(CurrentStackSize, MaxStackSize); -} - -void CFStack::pushBranch(unsigned Opcode, bool isWQM) { - CFStack::StackItem Item = CFStack::ENTRY; - switch(Opcode) { - case AMDGPU::CF_PUSH_EG: - case AMDGPU::CF_ALU_PUSH_BEFORE: - if (!isWQM) { - if (!ST->hasCaymanISA() && - !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) - Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI - // See comment in - // CFStack::getSubEntrySize() - else if (CurrentEntries > 0 && - ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && - !ST->hasCaymanISA() && - !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) - Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; - else - Item = CFStack::SUB_ENTRY; - } else - Item = CFStack::ENTRY; - break; - } - BranchStack.push_back(Item); - if (Item == CFStack::ENTRY) - CurrentEntries++; - else - CurrentSubEntries += getSubEntrySize(Item); - updateMaxStackSize(); -} - -void CFStack::pushLoop() { - LoopStack.push_back(CFStack::ENTRY); - CurrentEntries++; - updateMaxStackSize(); -} - -void CFStack::popBranch() { - CFStack::StackItem Top = BranchStack.back(); - if (Top == CFStack::ENTRY) - CurrentEntries--; - else - CurrentSubEntries-= getSubEntrySize(Top); - BranchStack.pop_back(); -} - -void CFStack::popLoop() { - CurrentEntries--; - LoopStack.pop_back(); -} - -class R600ControlFlowFinalizer : public MachineFunctionPass { - -private: - typedef std::pair > ClauseFile; - - enum ControlFlowInstruction { - CF_TC, - CF_VC, - CF_CALL_FS, - CF_WHILE_LOOP, - CF_END_LOOP, - CF_LOOP_BREAK, - CF_LOOP_CONTINUE, - CF_JUMP, - CF_ELSE, - CF_POP, - CF_END - }; - - static char ID; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - unsigned MaxFetchInst; - const AMDGPUSubtarget *ST; - - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - return true; - default: - return false; - } - } - - const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { - unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); - switch (CFI) { - case CF_TC: - Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; - break; - case CF_VC: - Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; - break; - case CF_CALL_FS: - Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; - break; - case CF_WHILE_LOOP: - Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; - break; - case CF_END_LOOP: - Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; - break; - case CF_LOOP_BREAK: - Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; - break; - case CF_LOOP_CONTINUE: - Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; - break; - case CF_JUMP: - Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; - break; - case CF_ELSE: - Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; - break; - case CF_POP: - Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; - break; - case CF_END: - if (ST->hasCaymanISA()) { - Opcode = AMDGPU::CF_END_CM; - break; - } - Opcode = isEg ? 
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; - break; - } - assert (Opcode && "No opcode selected"); - return TII->get(Opcode); - } - - bool isCompatibleWithClause(const MachineInstr *MI, - std::set &DstRegs) const { - unsigned DstMI, SrcMI; - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - const MachineOperand &MO = *I; - if (!MO.isReg()) - continue; - if (MO.isDef()) { - unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) - DstMI = Reg; - else - DstMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); - } - if (MO.isUse()) { - unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) - SrcMI = Reg; - else - SrcMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); - } - } - if ((DstRegs.find(SrcMI) == DstRegs.end())) { - DstRegs.insert(DstMI); - return true; - } else - return false; - } - - ClauseFile - MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) - const { - MachineBasicBlock::iterator ClauseHead = I; - std::vector ClauseContent; - unsigned AluInstCount = 0; - bool IsTex = TII->usesTextureCache(ClauseHead); - std::set DstRegs; - for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) - continue; - if (AluInstCount >= MaxFetchInst) - break; - if ((IsTex && !TII->usesTextureCache(I)) || - (!IsTex && !TII->usesVertexCache(I))) - break; - if (!isCompatibleWithClause(I, DstRegs)) - break; - AluInstCount ++; - ClauseContent.push_back(I); - } - MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), - getHWInstrDesc(IsTex?CF_TC:CF_VC)) - .addImm(0) // ADDR - .addImm(AluInstCount - 1); // COUNT - return ClauseFile(MIb, std::move(ClauseContent)); - } - - void getLiteral(MachineInstr *MI, std::vector &Lits) const { - static const unsigned LiteralRegs[] = { - AMDGPU::ALU_LITERAL_X, - AMDGPU::ALU_LITERAL_Y, - AMDGPU::ALU_LITERAL_Z, - AMDGPU::ALU_LITERAL_W - }; - const SmallVector, 3 > Srcs = - TII->getSrcs(MI); - for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { - if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) - continue; - int64_t Imm = Srcs[i].second; - std::vector::iterator It = - std::find(Lits.begin(), Lits.end(), Imm); - if (It != Lits.end()) { - unsigned Index = It - Lits.begin(); - Srcs[i].first->setReg(LiteralRegs[Index]); - } else { - assert(Lits.size() < 4 && "Too many literals in Instruction Group"); - Srcs[i].first->setReg(LiteralRegs[Lits.size()]); - Lits.push_back(Imm); - } - } - } - - MachineBasicBlock::iterator insertLiterals( - MachineBasicBlock::iterator InsertPos, - const std::vector &Literals) const { - MachineBasicBlock *MBB = InsertPos->getParent(); - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned LiteralPair0 = Literals[i]; - unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; - InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(LiteralPair0) - .addImm(LiteralPair1); - } - return InsertPos; - } - - ClauseFile - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) - const { - MachineBasicBlock::iterator ClauseHead = I; - std::vector ClauseContent; - I++; - for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { - if (IsTrivialInst(I)) { - ++I; - continue; - } - if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) - break; - std::vector Literals; - if (I->isBundle()) 
{ - MachineInstr *DeleteMI = I; - MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); - while (++BI != E && BI->isBundledWithPred()) { - BI->unbundleFromPred(); - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BI->getOperand(i); - if (MO.isReg() && MO.isInternalRead()) - MO.setIsInternalRead(false); - } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); - } - I = BI; - DeleteMI->eraseFromParent(); - } else { - getLiteral(I, Literals); - ClauseContent.push_back(I); - I++; - } - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned literal0 = Literals[i]; - unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; - MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(literal0) - .addImm(literal2); - ClauseContent.push_back(MILit); - } - } - assert(ClauseContent.size() < 128 && "ALU clause is too big"); - ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, std::move(ClauseContent)); - } - - void - EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { - CounterPropagateAddr(Clause.first, CfCount); - MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) - .addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } - CfCount += 2 * Clause.second.size(); - } - - void - EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, - unsigned &CfCount) { - Clause.first->getOperand(0).setImm(0); - CounterPropagateAddr(Clause.first, CfCount); - MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) - .addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } - CfCount += Clause.second.size(); - } - - void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { - MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); - } - void CounterPropagateAddr(const std::set &MIs, - unsigned Addr) const { - for (MachineInstr *MI : MIs) { - CounterPropagateAddr(MI, Addr); - } - } - -public: - R600ControlFlowFinalizer(TargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ST = &MF.getSubtarget(); - MaxFetchInst = ST->getTexVTXClauseSize(); - TII = static_cast(ST->getInstrInfo()); - TRI = static_cast(ST->getRegisterInfo()); - R600MachineFunctionInfo *MFI = MF.getInfo(); - - CFStack CFStack(ST, MFI->getShaderType()); - for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; - ++MB) { - MachineBasicBlock &MBB = *MB; - unsigned CfCount = 0; - std::vector > > LoopStack; - std::vector IfThenElseStack; - if (MFI->getShaderType() == ShaderType::VERTEX) { - BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), - getHWInstrDesc(CF_CALL_FS)); - CfCount++; - } - std::vector FetchClauses, AluClauses; - std::vector LastAlu(1); - std::vector ToPopAfter; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E;) { - if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { - DEBUG(dbgs() << CfCount << ":"; I->dump();); - FetchClauses.push_back(MakeFetchClause(MBB, I)); - CfCount++; - LastAlu.back() = nullptr; - continue; - } - - MachineBasicBlock::iterator MI = I; - if (MI->getOpcode() != AMDGPU::ENDIF) - 
LastAlu.back() = nullptr; - if (MI->getOpcode() == AMDGPU::CF_ALU) - LastAlu.back() = MI; - I++; - bool RequiresWorkAround = - CFStack.requiresWorkAroundForInst(MI->getOpcode()); - switch (MI->getOpcode()) { - case AMDGPU::CF_ALU_PUSH_BEFORE: - if (RequiresWorkAround) { - DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) - .addImm(CfCount + 1) - .addImm(1); - MI->setDesc(TII->get(AMDGPU::CF_ALU)); - CfCount++; - CFStack.pushBranch(AMDGPU::CF_PUSH_EG); - } else - CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); - - case AMDGPU::CF_ALU: - I = MI; - AluClauses.push_back(MakeALUClause(MBB, I)); - DEBUG(dbgs() << CfCount << ":"; MI->dump();); - CfCount++; - break; - case AMDGPU::WHILELOOP: { - CFStack.pushLoop(); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_WHILE_LOOP)) - .addImm(1); - std::pair > Pair(CfCount, - std::set()); - Pair.second.insert(MIb); - LoopStack.push_back(std::move(Pair)); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ENDLOOP: { - CFStack.popLoop(); - std::pair > Pair = - std::move(LoopStack.back()); - LoopStack.pop_back(); - CounterPropagateAddr(Pair.second, CfCount); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) - .addImm(Pair.first + 1); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::IF_PREDICATE_SET: { - LastAlu.push_back(nullptr); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_JUMP)) - .addImm(0) - .addImm(0); - IfThenElseStack.push_back(MIb); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ELSE: { - MachineInstr * JumpInst = IfThenElseStack.back(); - IfThenElseStack.pop_back(); - CounterPropagateAddr(JumpInst, CfCount); - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_ELSE)) - .addImm(0) - .addImm(0); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - IfThenElseStack.push_back(MIb); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::ENDIF: { - CFStack.popBranch(); - if (LastAlu.back()) { - ToPopAfter.push_back(LastAlu.back()); - } else { - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_POP)) - .addImm(CfCount + 1) - .addImm(1); - (void)MIb; - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); - CfCount++; - } - - MachineInstr *IfOrElseInst = IfThenElseStack.back(); - IfThenElseStack.pop_back(); - CounterPropagateAddr(IfOrElseInst, CfCount); - IfOrElseInst->getOperand(1).setImm(1); - LastAlu.pop_back(); - MI->eraseFromParent(); - break; - } - case AMDGPU::BREAK: { - CfCount ++; - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_LOOP_BREAK)) - .addImm(0); - LoopStack.back().second.insert(MIb); - MI->eraseFromParent(); - break; - } - case AMDGPU::CONTINUE: { - MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - getHWInstrDesc(CF_LOOP_CONTINUE)) - .addImm(0); - LoopStack.back().second.insert(MIb); - MI->eraseFromParent(); - CfCount++; - break; - } - case AMDGPU::RETURN: { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); - CfCount++; - MI->eraseFromParent(); - if (CfCount % 2) { - BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); - CfCount++; - } - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, AluClauses[i], CfCount); 
- } - default: - if (TII->isExport(MI->getOpcode())) { - DEBUG(dbgs() << CfCount << ":"; MI->dump();); - CfCount++; - } - break; - } - } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { - MachineInstr *Alu = ToPopAfter[i]; - BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), - TII->get(AMDGPU::CF_ALU_POP_AFTER)) - .addImm(Alu->getOperand(0).getImm()) - .addImm(Alu->getOperand(1).getImm()) - .addImm(Alu->getOperand(2).getImm()) - .addImm(Alu->getOperand(3).getImm()) - .addImm(Alu->getOperand(4).getImm()) - .addImm(Alu->getOperand(5).getImm()) - .addImm(Alu->getOperand(6).getImm()) - .addImm(Alu->getOperand(7).getImm()) - .addImm(Alu->getOperand(8).getImm()); - Alu->eraseFromParent(); - } - MFI->StackSize = CFStack.MaxStackSize; - } - - return false; - } - - const char *getPassName() const override { - return "R600 Control Flow Finalizer Pass"; - } -}; - -char R600ControlFlowFinalizer::ID = 0; - -} // end anonymous namespace - - -llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { - return new R600ControlFlowFinalizer(TM); -} diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h deleted file mode 100644 index 51d87eda31d..00000000000 --- a/lib/Target/R600/R600Defines.h +++ /dev/null @@ -1,171 +0,0 @@ -//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H -#define LLVM_LIB_TARGET_R600_R600DEFINES_H - -#include "llvm/MC/MCRegisterInfo.h" - -// Operand Flags -#define MO_FLAG_CLAMP (1 << 0) -#define MO_FLAG_NEG (1 << 1) -#define MO_FLAG_ABS (1 << 2) -#define MO_FLAG_MASK (1 << 3) -#define MO_FLAG_PUSH (1 << 4) -#define MO_FLAG_NOT_LAST (1 << 5) -#define MO_FLAG_LAST (1 << 6) -#define NUM_MO_FLAGS 7 - -/// \brief Helper for getting the operand index for the instruction flags -/// operand. 
-#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) - -namespace R600_InstFlag { - enum TIF { - TRANS_ONLY = (1 << 0), - TEX = (1 << 1), - REDUCTION = (1 << 2), - FC = (1 << 3), - TRIG = (1 << 4), - OP3 = (1 << 5), - VECTOR = (1 << 6), - //FlagOperand bits 7, 8 - NATIVE_OPERANDS = (1 << 9), - OP1 = (1 << 10), - OP2 = (1 << 11), - VTX_INST = (1 << 12), - TEX_INST = (1 << 13), - ALU_INST = (1 << 14), - LDS_1A = (1 << 15), - LDS_1A1D = (1 << 16), - IS_EXPORT = (1 << 17), - LDS_1A2D = (1 << 18) - }; -} - -#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) - -/// \brief Defines for extracting register information from register encoding -#define HW_REG_MASK 0x1ff -#define HW_CHAN_SHIFT 9 - -#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) -#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) - -#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) -#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) - -namespace OpName { - - enum VecOps { - UPDATE_EXEC_MASK_X, - UPDATE_PREDICATE_X, - WRITE_X, - OMOD_X, - DST_REL_X, - CLAMP_X, - SRC0_X, - SRC0_NEG_X, - SRC0_REL_X, - SRC0_ABS_X, - SRC0_SEL_X, - SRC1_X, - SRC1_NEG_X, - SRC1_REL_X, - SRC1_ABS_X, - SRC1_SEL_X, - PRED_SEL_X, - UPDATE_EXEC_MASK_Y, - UPDATE_PREDICATE_Y, - WRITE_Y, - OMOD_Y, - DST_REL_Y, - CLAMP_Y, - SRC0_Y, - SRC0_NEG_Y, - SRC0_REL_Y, - SRC0_ABS_Y, - SRC0_SEL_Y, - SRC1_Y, - SRC1_NEG_Y, - SRC1_REL_Y, - SRC1_ABS_Y, - SRC1_SEL_Y, - PRED_SEL_Y, - UPDATE_EXEC_MASK_Z, - UPDATE_PREDICATE_Z, - WRITE_Z, - OMOD_Z, - DST_REL_Z, - CLAMP_Z, - SRC0_Z, - SRC0_NEG_Z, - SRC0_REL_Z, - SRC0_ABS_Z, - SRC0_SEL_Z, - SRC1_Z, - SRC1_NEG_Z, - SRC1_REL_Z, - SRC1_ABS_Z, - SRC1_SEL_Z, - PRED_SEL_Z, - UPDATE_EXEC_MASK_W, - UPDATE_PREDICATE_W, - WRITE_W, - OMOD_W, - DST_REL_W, - CLAMP_W, - SRC0_W, - SRC0_NEG_W, - SRC0_REL_W, - SRC0_ABS_W, - SRC0_SEL_W, - SRC1_W, - SRC1_NEG_W, - SRC1_REL_W, - SRC1_ABS_W, - SRC1_SEL_W, - PRED_SEL_W, - IMM_0, - IMM_1, - VEC_COUNT - }; - -} - -//===----------------------------------------------------------------------===// -// Config register definitions -//===----------------------------------------------------------------------===// - -#define R_02880C_DB_SHADER_CONTROL 0x02880C -#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) - -// These fields are the same for all shader types and families. 
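The register-encoding macros just above are the counterpart of getHWReg and getHWRegChan in the R600 code emitter: a register's hardware encoding keeps the channel above bit 9 and the register index in the low nine bits. A short sanity check of that split (the encoding value below is made up purely for illustration):

.. code-block:: cpp

  // Demonstrates GET_REG_INDEX / GET_REG_CHAN on a hand-built encoding.
  #include <cstdio>

  #define HW_REG_MASK   0x1ff
  #define HW_CHAN_SHIFT 9
  #define GET_REG_CHAN(reg)  ((reg) >> HW_CHAN_SHIFT)
  #define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)

  int main() {
    unsigned Enc = (2u << HW_CHAN_SHIFT) | 5u;  // channel 2, index 5
    printf("index=%u chan=%u\n", GET_REG_INDEX(Enc), GET_REG_CHAN(Enc)); // index=5 chan=2
  }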
-#define S_NUM_GPRS(x) (((x) & 0xFF) << 0) -#define S_STACK_SIZE(x) (((x) & 0xFF) << 8) -//===----------------------------------------------------------------------===// -// R600, R700 Registers -//===----------------------------------------------------------------------===// - -#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 -#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 - -//===----------------------------------------------------------------------===// -// Evergreen, Northern Islands Registers -//===----------------------------------------------------------------------===// - -#define R_028844_SQ_PGM_RESOURCES_PS 0x028844 -#define R_028860_SQ_PGM_RESOURCES_VS 0x028860 -#define R_028878_SQ_PGM_RESOURCES_GS 0x028878 -#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4 - -#define R_0288E8_SQ_LDS_ALLOC 0x0288E8 - -#endif diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp deleted file mode 100644 index fdc20302f4a..00000000000 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ /dev/null @@ -1,336 +0,0 @@ -//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold -/// 128 Alu instructions ; these instructions can access up to 4 prefetched -/// 4 lines of 16 registers from constant buffers. Such ALU clauses are -/// initiated by CF_ALU instructions. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace llvm { - void initializeR600EmitClauseMarkersPass(PassRegistry&); -} - -namespace { - -class R600EmitClauseMarkers : public MachineFunctionPass { - -private: - const R600InstrInfo *TII; - int Address; - - unsigned OccupiedDwords(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return 4; - case AMDGPU::KILL: - return 0; - default: - break; - } - - // These will be expanded to two ALU instructions in the - // ExpandSpecialInstructions pass. 
- if (TII->isLDSRetInstr(MI->getOpcode())) - return 2; - - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode())) - return 4; - - unsigned NumLiteral = 0; - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) - ++NumLiteral; - } - return 1 + NumLiteral; - } - - bool isALU(const MachineInstr *MI) const { - if (TII->isALUInstr(MI->getOpcode())) - return true; - if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) - return true; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: - return true; - default: - return false; - } - } - - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - case AMDGPU::IMPLICIT_DEF: - return true; - default: - return false; - } - } - - std::pair getAccessedBankLine(unsigned Sel) const { - // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2 - // (See also R600ISelLowering.cpp) - // ConstIndex value is in [0, 4095]; - return std::pair( - ((Sel >> 2) - 512) >> 12, // KC_BANK - // Line Number of ConstIndex - // A line contains 16 constant registers however KCX bank can lock - // two line at the same time ; thus we want to get an even line number. - // Line number can be retrieved with (>>4), using (>>5) <<1 generates - // an even number. - ((((Sel >> 2) - 512) & 4095) >> 5) << 1); - } - - bool SubstituteKCacheBank(MachineInstr *MI, - std::vector > &CachedConsts, - bool UpdateInstr = true) const { - std::vector > UsedKCache; - - if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) - return true; - - const SmallVectorImpl > &Consts = - TII->getSrcs(MI); - assert((TII->isALUInstr(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) - continue; - unsigned Sel = Consts[i].second; - unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; - unsigned KCacheIndex = Index * 4 + Chan; - const std::pair &BankLine = getAccessedBankLine(Sel); - if (CachedConsts.empty()) { - CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(0, KCacheIndex)); - continue; - } - if (CachedConsts[0] == BankLine) { - UsedKCache.push_back(std::pair(0, KCacheIndex)); - continue; - } - if (CachedConsts.size() == 1) { - CachedConsts.push_back(BankLine); - UsedKCache.push_back(std::pair(1, KCacheIndex)); - continue; - } - if (CachedConsts[1] == BankLine) { - UsedKCache.push_back(std::pair(1, KCacheIndex)); - continue; - } - return false; - } - - if (!UpdateInstr) - return true; - - for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) - continue; - switch(UsedKCache[j].first) { - case 0: - Consts[i].first->setReg( - AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); - break; - case 1: - Consts[i].first->setReg( - AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); - break; - default: - llvm_unreachable("Wrong Cache Line"); - } - j++; - } - return true; - } - - bool canClauseLocalKillFitInClause( - unsigned AluInstCount, - std::vector > KCacheBanks, - MachineBasicBlock::iterator Def, - MachineBasicBlock::iterator BBEnd) { - const R600RegisterInfo &TRI = 
TII->getRegisterInfo(); - for (MachineInstr::const_mop_iterator - MOI = Def->operands_begin(), - MOE = Def->operands_end(); MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef() || - TRI.isPhysRegLiveAcrossClauses(MOI->getReg())) - continue; - - // Def defines a clause local register, so check that its use will fit - // in the clause. - unsigned LastUseCount = 0; - for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { - AluInstCount += OccupiedDwords(UseI); - // Make sure we won't need to end the clause due to KCache limitations. - if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) - return false; - - // We have reached the maximum instruction limit before finding the - // use that kills this register, so we cannot use this def in the - // current clause. - if (AluInstCount >= TII->getMaxAlusPerClause()) - return false; - - // Register kill flags have been cleared by the time we get to this - // pass, but it is safe to assume that all uses of this register - // occur in the same basic block as its definition, because - // it is illegal for the scheduler to schedule them in - // different blocks. - if (UseI->findRegisterUseOperandIdx(MOI->getReg())) - LastUseCount = AluInstCount; - - if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1) - break; - } - if (LastUseCount) - return LastUseCount <= TII->getMaxAlusPerClause(); - llvm_unreachable("Clause local register live at end of clause."); - } - return true; - } - - MachineBasicBlock::iterator - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - MachineBasicBlock::iterator ClauseHead = I; - std::vector > KCacheBanks; - bool PushBeforeModifier = false; - unsigned AluInstCount = 0; - for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) - continue; - if (!isALU(I)) - break; - if (AluInstCount > TII->getMaxAlusPerClause()) - break; - if (I->getOpcode() == AMDGPU::PRED_X) { - // We put PRED_X in its own clause to ensure that ifcvt won't create - // clauses with more than 128 insts. - // IfCvt is indeed checking that "then" and "else" branches of an if - // statement have less than ~60 insts thus converted clauses can't be - // bigger than ~121 insts (predicate setter needs to be in the same - // clause as predicated alus). - if (AluInstCount > 0) - break; - if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) - PushBeforeModifier = true; - AluInstCount ++; - continue; - } - // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as: - // - // * KILL or INTERP instructions - // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits - // * Uses waterfalling (i.e. INDEX_MODE = AR.X) - // - // XXX: These checks have not been implemented yet. - if (TII->mustBeLastInClause(I->getOpcode())) { - I++; - break; - } - - // If this instruction defines a clause local register, make sure - // its use can fit in this clause. - if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) - break; - - if (!SubstituteKCacheBank(I, KCacheBanks)) - break; - AluInstCount += OccupiedDwords(I); - } - unsigned Opcode = PushBeforeModifier ? - AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; - BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) - // We don't use the ADDR field until R600ControlFlowFinalizer pass, where - // it is safe to assume it is 0. However if we always put 0 here, the ifcvt - // pass may assume that identical ALU clause starter at the beginning of a - // true and false branch can be factorized which is not the case. 
- .addImm(Address++) // ADDR - .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 - .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 - .addImm(KCacheBanks.empty()?0:2) // KM0 - .addImm((KCacheBanks.size() < 2)?0:2) // KM1 - .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 - .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 - .addImm(AluInstCount) // COUNT - .addImm(1); // Enabled - return I; - } - -public: - static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) { - - initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(); - if (I->getOpcode() == AMDGPU::CF_ALU) - continue; // BB was already parsed - for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { - if (isALU(I)) - I = MakeALUClause(MBB, I); - else - ++I; - } - } - return false; - } - - const char *getPassName() const override { - return "R600 Emit Clause Markers Pass"; - } -}; - -char R600EmitClauseMarkers::ID = 0; - -} // end anonymous namespace - -INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) -INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers", - "R600 Emit Clause Markters", false, false) - -llvm::FunctionPass *llvm::createR600EmitClauseMarkers() { - return new R600EmitClauseMarkers(); -} - diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp deleted file mode 100644 index 211d392e8fc..00000000000 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ /dev/null @@ -1,349 +0,0 @@ -//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Vector, Reduction, and Cube instructions need to fill the entire instruction -/// group to work correctly. This pass expands these individual instructions -/// into several instructions that will completely fill the instruction group. 
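One detail of this pass that benefits from a worked example is the CUBE expansion: the per-channel source components are chosen through a small swizzle table (CubeSrcSwz = {2, 2, 0, 1}), and walking that table reproduces the four expanded slots listed in the comments further down. A quick standalone check:

.. code-block:: cpp

  // Reproduces the CUBE per-channel source swizzle used by this pass.
  #include <cstdio>

  int main() {
    static const int CubeSrcSwz[] = {2, 2, 0, 1};   // Z, Z, X, Y
    const char Comp[] = {'X', 'Y', 'Z', 'W'};
    for (int Chan = 0; Chan < 4; ++Chan)
      printf("T0_%c = CUBE T1_%c, T1_%c\n", Comp[Chan],
             Comp[CubeSrcSwz[Chan]], Comp[CubeSrcSwz[3 - Chan]]);
    // Prints: T0_X = CUBE T1_Z, T1_Y / T0_Y = CUBE T1_Z, T1_X
    //         T0_Z = CUBE T1_X, T1_Z / T0_W = CUBE T1_Y, T1_Z
  }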
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -class R600ExpandSpecialInstrsPass : public MachineFunctionPass { - -private: - static char ID; - const R600InstrInfo *TII; - - void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, - unsigned Op); - -public: - R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(nullptr) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "R600 Expand special instructions pass"; - } -}; - -} // End anonymous namespace - -char R600ExpandSpecialInstrsPass::ID = 0; - -FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { - return new R600ExpandSpecialInstrsPass(TM); -} - -void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, - const MachineInstr *OldMI, unsigned Op) { - int OpIdx = TII->getOperandIdx(*OldMI, Op); - if (OpIdx > -1) { - uint64_t Val = OldMI->getOperand(OpIdx).getImm(); - TII->setImmOperand(NewMI, Op, Val); - } -} - -bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - MachineBasicBlock::iterator I = MBB.begin(); - while (I != MBB.end()) { - MachineInstr &MI = *I; - I = std::next(I); - - // Expand LDS_*_RET instructions - if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - MachineOperand &DstOp = MI.getOperand(DstIdx); - MachineInstr *Mov = TII->buildMovInstr(&MBB, I, - DstOp.getReg(), AMDGPU::OQAP); - DstOp.setReg(AMDGPU::OQAP); - int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), - AMDGPU::OpName::pred_sel); - int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), - AMDGPU::OpName::pred_sel); - // Copy the pred_sel bit - Mov->getOperand(MovPredSelIdx).setReg( - MI.getOperand(LDSPredSelIdx).getReg()); - } - - switch (MI.getOpcode()) { - default: break; - // Expand PRED_X to one of the PRED_SET instructions. - case AMDGPU::PRED_X: { - uint64_t Flags = MI.getOperand(3).getImm(); - // The native opcode used by PRED_X is stored as an immediate in the - // third operand. - MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, - MI.getOperand(2).getImm(), // opcode - MI.getOperand(0).getReg(), // dst - MI.getOperand(1).getReg(), // src0 - AMDGPU::ZERO); // src1 - TII->addFlag(PredSet, 0, MO_FLAG_MASK); - if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); - } else { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); - } - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_XY: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = MI.getOperand(Chan).getReg(); - else - DstReg = Chan == 2 ? 
AMDGPU::T0_Z : AMDGPU::T0_W; - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan >= 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_PAIR_ZW: { - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - unsigned DstReg; - - if (Chan < 2) - DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; - else - DstReg = MI.getOperand(Chan-2).getReg(); - - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, - DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); - - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan < 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - - case AMDGPU::INTERP_VEC_LOAD: { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - MachineInstr *BMI; - unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(1).getImm()); - unsigned DstReg = MI.getOperand(0).getReg(); - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, - TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - continue; - } - case AMDGPU::DOT_4: { - - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - - for (unsigned Chan = 0; Chan < 4; ++Chan) { - bool Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); - MachineInstr *BMI = - TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); - if (Chan > 0) { - BMI->bundleWithPred(); - } - if (Mask) { - TII->addFlag(BMI, 0, MO_FLAG_MASK); - } - if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); - unsigned Opcode = BMI->getOpcode(); - // While not strictly necessary from hw point of view, we force - // all src operands of a dot4 inst to belong to the same slot. 
- unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) - .getReg(); - unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) - .getReg(); - (void) Src0; - (void) Src1; - if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && - (TRI.getEncodingValue(Src1) & 0xff) < 127) - assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); - } - MI.eraseFromParent(); - continue; - } - } - - bool IsReduction = TII->isReductionOp(MI.getOpcode()); - bool IsVector = TII->isVector(MI); - bool IsCube = TII->isCubeOp(MI.getOpcode()); - if (!IsReduction && !IsVector && !IsCube) { - continue; - } - - // Expand the instruction - // - // Reduction instructions: - // T0_X = DP4 T1_XYZW, T2_XYZW - // becomes: - // TO_X = DP4 T1_X, T2_X - // TO_Y (write masked) = DP4 T1_Y, T2_Y - // TO_Z (write masked) = DP4 T1_Z, T2_Z - // TO_W (write masked) = DP4 T1_W, T2_W - // - // Vector instructions: - // T0_X = MULLO_INT T1_X, T2_X - // becomes: - // T0_X = MULLO_INT T1_X, T2_X - // T0_Y (write masked) = MULLO_INT T1_X, T2_X - // T0_Z (write masked) = MULLO_INT T1_X, T2_X - // T0_W (write masked) = MULLO_INT T1_X, T2_X - // - // Cube instructions: - // T0_XYZW = CUBE T1_XYZW - // becomes: - // TO_X = CUBE T1_Z, T1_Y - // T0_Y = CUBE T1_Z, T1_X - // T0_Z = CUBE T1_X, T1_Z - // T0_W = CUBE T1_Y, T1_Z - for (unsigned Chan = 0; Chan < 4; Chan++) { - unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); - unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); - unsigned Src1 = 0; - - // Determine the correct source registers - if (!IsCube) { - int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); - if (Src1Idx != -1) { - Src1 = MI.getOperand(Src1Idx).getReg(); - } - } - if (IsReduction) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); - Src0 = TRI.getSubReg(Src0, SubRegIndex); - Src1 = TRI.getSubReg(Src1, SubRegIndex); - } else if (IsCube) { - static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); - Src1 = TRI.getSubReg(Src0, SubRegIndex1); - Src0 = TRI.getSubReg(Src0, SubRegIndex0); - } - - // Determine the correct destination registers; - bool Mask = false; - bool NotLast = true; - if (IsCube) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); - DstReg = TRI.getSubReg(DstReg, SubRegIndex); - } else { - // Mask the write if the original instruction does not write to - // the current Channel. 
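For reference, the per-channel destination here is picked out of the flat R600_TReg32 register class, whose entries are assumed in this code to be ordered four channels per vector register, so the index computed below is simply:

    SubDstReg = R600_TReg32[(DstBase * 4) + Chan]   // channel Chan of vector register DstBase
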
- Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); - } - - // Set the IsLast bit - NotLast = (Chan != 3 ); - - // Add the new instruction - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::CUBE_r600_pseudo: - Opcode = AMDGPU::CUBE_r600_real; - break; - case AMDGPU::CUBE_eg_pseudo: - Opcode = AMDGPU::CUBE_eg_real; - break; - default: - break; - } - - MachineInstr *NewMI = - TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); - - if (Chan != 0) - NewMI->bundleWithPred(); - if (Mask) { - TII->addFlag(NewMI, 0, MO_FLAG_MASK); - } - if (NotLast) { - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); - } - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); - } - MI.eraseFromParent(); - } - } - return false; -} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp deleted file mode 100644 index 8357b6d9d0e..00000000000 --- a/lib/Target/R600/R600ISelLowering.cpp +++ /dev/null @@ -1,2286 +0,0 @@ -//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Custom DAG lowering for R600 -// -//===----------------------------------------------------------------------===// - -#include "R600ISelLowering.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Argument.h" -#include "llvm/IR/Function.h" - -using namespace llvm; - -R600TargetLowering::R600TargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - - computeRegisterProperties(STI.getRegisterInfo()); - - // Set condition code actions - setCondCodeAction(ISD::SETO, MVT::f32, Expand); - setCondCodeAction(ISD::SETUO, MVT::f32, Expand); - setCondCodeAction(ISD::SETLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLT, MVT::f32, Expand); - setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); - setCondCodeAction(ISD::SETONE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); - 
setCondCodeAction(ISD::SETULT, MVT::f32, Expand); - setCondCodeAction(ISD::SETULE, MVT::f32, Expand); - - setCondCodeAction(ISD::SETLE, MVT::i32, Expand); - setCondCodeAction(ISD::SETLT, MVT::i32, Expand); - setCondCodeAction(ISD::SETULE, MVT::i32, Expand); - setCondCodeAction(ISD::SETULT, MVT::i32, Expand); - - setOperationAction(ISD::FCOS, MVT::f32, Custom); - setOperationAction(ISD::FSIN, MVT::f32, Custom); - - setOperationAction(ISD::SETCC, MVT::v4i32, Expand); - setOperationAction(ISD::SETCC, MVT::v2i32, Expand); - - setOperationAction(ISD::BR_CC, MVT::i32, Expand); - setOperationAction(ISD::BR_CC, MVT::f32, Expand); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - setOperationAction(ISD::FSUB, MVT::f32, Expand); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - - setOperationAction(ISD::SETCC, MVT::i32, Expand); - setOperationAction(ISD::SETCC, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - - setOperationAction(ISD::SELECT, MVT::i32, Expand); - setOperationAction(ISD::SELECT, MVT::f32, Expand); - setOperationAction(ISD::SELECT, MVT::v2i32, Expand); - setOperationAction(ISD::SELECT, MVT::v4i32, Expand); - - // ADD, SUB overflow. - // TODO: turn these into Legal? - if (Subtarget->hasCARRY()) - setOperationAction(ISD::UADDO, MVT::i32, Custom); - - if (Subtarget->hasBORROW()) - setOperationAction(ISD::USUBO, MVT::i32, Custom); - - // Expand sign extension of vectors - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand); - - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand); - - if (!Subtarget->hasBFE()) - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); - - - // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address - // spaces, so it is custom lowered to handle those where it isn't. 
- for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } - - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); - - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 - // to be Legal/Custom in order to avoid library calls. - setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); - } - - setSchedulingPreference(Sched::Source); -} - -MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - MachineFunction * MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineBasicBlock::iterator I = *MI; - const R600InstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - switch (MI->getOpcode()) { - default: - // Replace LDS_*_RET instruction that don't have any uses with the - // equivalent LDS_*_NORET instruction. - if (TII->isLDSRetInstr(MI->getOpcode())) { - int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - MachineInstrBuilder NewMI; - // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add - // LDS_1A2D support and remove this special case. 
- if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || - MI->getOpcode() == AMDGPU::LDS_CMPST_RET) - return BB; - - NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI->getOperand(i)); - } - } else { - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - } - break; - case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); - break; - } - - case AMDGPU::FABS_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_ABS); - break; - } - - case AMDGPU::FNEG_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_NEG); - break; - } - - case AMDGPU::MASK_WRITE: { - unsigned maskedRegister = MI->getOperand(0).getReg(); - assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); - MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); - TII->addFlag(defInstr, 0, MO_FLAG_MASK); - break; - } - - case AMDGPU::MOV_IMM_F32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getFPImm()->getValueAPF() - .bitcastToAPInt().getZExtValue()); - break; - case AMDGPU::MOV_IMM_I32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); - break; - case AMDGPU::CONST_COPY: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, - MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, - MI->getOperand(1).getImm()); - break; - } - - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit - break; - } - - case AMDGPU::TXD: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::TXD_SHADOW: { - unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); - unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; - unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; - - switch (TextureId) { - case 5: // Rect - CTX = CTY = 0; - break; - case 6: // Shadow1D - SrcW = SrcZ; - break; - case 7: // Shadow2D - SrcW = SrcZ; - break; - case 8: // ShadowRect - CTX = CTY = 0; - SrcW = SrcZ; - break; - case 9: // 1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 10: // 2DArray - CTZ = 0; - break; - case 11: // Shadow1DArray - SrcZ = SrcY; - CTZ = 0; - break; - case 12: // Shadow2DArray - CTZ = 0; - break; - } - - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 
- .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); - break; - } - - case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)); - break; - - case AMDGPU::BRANCH_COND_f32: { - MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - break; - } - - case AMDGPU::BRANCH_COND_i32: { - MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO_INT) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - break; - } - - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: { - // Instruction is left unmodified if its not the last one of its type - bool isLastInstructionOfItsType = true; - unsigned InstExportType = MI->getOperand(1).getImm(); - for (MachineBasicBlock::iterator NextExportInst = std::next(I), - EndBlock = BB->end(); NextExportInst != EndBlock; - NextExportInst = std::next(NextExportInst)) { - if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || - NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { - unsigned CurrentInstExportType = NextExportInst->getOperand(1) - .getImm(); - if (CurrentInstExportType == InstExportType) { - isLastInstructionOfItsType = false; - break; - } - } - } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - if (!EOP && !isLastInstructionOfItsType) - return BB; - unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(4)) - .addOperand(MI->getOperand(5)) - .addOperand(MI->getOperand(6)) - .addImm(CfInst) - .addImm(EOP); - break; - } - case AMDGPU::RETURN: { - // RETURN instructions must have the live-out registers as implicit uses, - // otherwise they appear dead. 
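Without those implicit operands nothing at the MachineInstr level reads the registers that the earlier copies wrote, so later passes could treat their definitions as dead. The loop below therefore appends one implicit use per recorded live-out, roughly:

    MIB.addReg(LiveOutReg, RegState::Implicit);   // one implicit use per live-out register
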
- R600MachineFunctionInfo *MFI = MF->getInfo(); - MachineInstrBuilder MIB(*MF, MI); - for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) - MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); - return BB; - } - } - - MI->eraseFromParent(); - return BB; -} - -//===----------------------------------------------------------------------===// -// Custom DAG Lowering Operations -//===----------------------------------------------------------------------===// - -SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); - switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG); - case ISD::SRA_PARTS: - case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG); - case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY); - case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW); - case ISD::FCOS: - case ISD::FSIN: return LowerTrig(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::LOAD: { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } - - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); - case ISD::INTRINSIC_VOID: { - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = - cast(Op.getOperand(1))->getZExtValue(); - switch (IntrinsicID) { - case AMDGPUIntrinsic::AMDGPU_store_output: { - int64_t RegIndex = cast(Op.getOperand(3))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MFI->LiveOuts.push_back(Reg); - return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); - } - case AMDGPUIntrinsic::R600_store_swizzle: { - SDLoc DL(Op); - const SDValue Args[8] = { - Chain, - Op.getOperand(2), // Export Value - Op.getOperand(3), // ArrayBase - Op.getOperand(4), // Type - DAG.getConstant(0, DL, MVT::i32), // SWZ_X - DAG.getConstant(1, DL, MVT::i32), // SWZ_Y - DAG.getConstant(2, DL, MVT::i32), // SWZ_Z - DAG.getConstant(3, DL, MVT::i32) // SWZ_W - }; - return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args); - } - - // default for switch(IntrinsicID) - default: break; - } - // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) - break; - } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntrinsicID = - cast(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - switch(IntrinsicID) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = cast(Op.getOperand(1))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); - } - - case AMDGPUIntrinsic::R600_interp_input: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - int ijb = cast(Op.getOperand(2))->getSExtValue(); - MachineSDNode *interp; - if (ijb < 0) { - const R600InstrInfo *TII = - 
static_cast(Subtarget->getInstrInfo()); - interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); - return DAG.getTargetExtractSubreg( - TII->getRegisterInfo().getSubRegFromChannel(slot % 4), - DL, MVT::f32, SDValue(interp, 0)); - } - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); - unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); - MRI.addLiveIn(RegisterI); - MRI.addLiveIn(RegisterJ); - SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); - SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); - - if (slot % 4 < 2) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - return SDValue(interp, slot % 2); - } - case AMDGPUIntrinsic::R600_interp_xy: - case AMDGPUIntrinsic::R600_interp_zw: { - int slot = cast(Op.getOperand(1))->getZExtValue(); - MachineSDNode *interp; - SDValue RegisterINode = Op.getOperand(2); - SDValue RegisterJNode = Op.getOperand(3); - - if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, - SDValue(interp, 0), SDValue(interp, 1)); - } - case AMDGPUIntrinsic::R600_tex: - case AMDGPUIntrinsic::R600_texc: - case AMDGPUIntrinsic::R600_txl: - case AMDGPUIntrinsic::R600_txlc: - case AMDGPUIntrinsic::R600_txb: - case AMDGPUIntrinsic::R600_txbc: - case AMDGPUIntrinsic::R600_txf: - case AMDGPUIntrinsic::R600_txq: - case AMDGPUIntrinsic::R600_ddx: - case AMDGPUIntrinsic::R600_ddy: - case AMDGPUIntrinsic::R600_ldptr: { - unsigned TextureOp; - switch (IntrinsicID) { - case AMDGPUIntrinsic::R600_tex: - TextureOp = 0; - break; - case AMDGPUIntrinsic::R600_texc: - TextureOp = 1; - break; - case AMDGPUIntrinsic::R600_txl: - TextureOp = 2; - break; - case AMDGPUIntrinsic::R600_txlc: - TextureOp = 3; - break; - case AMDGPUIntrinsic::R600_txb: - TextureOp = 4; - break; - case AMDGPUIntrinsic::R600_txbc: - TextureOp = 5; - break; - case AMDGPUIntrinsic::R600_txf: - TextureOp = 6; - break; - case AMDGPUIntrinsic::R600_txq: - TextureOp = 7; - break; - case AMDGPUIntrinsic::R600_ddx: - TextureOp = 8; - break; - case AMDGPUIntrinsic::R600_ddy: - TextureOp = 9; - break; - case AMDGPUIntrinsic::R600_ldptr: - TextureOp = 10; - break; - default: - llvm_unreachable("Unknow Texture Operation"); - } - - SDValue TexArgs[19] = { - DAG.getConstant(TextureOp, DL, MVT::i32), - Op.getOperand(1), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), - DAG.getConstant(3, DL, MVT::i32), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), - DAG.getConstant(3, DL, MVT::i32), - 
Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10) - }; - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); - } - case AMDGPUIntrinsic::AMDGPU_dp4: { - SDValue Args[8] = { - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(0, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(0, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(1, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(1, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(2, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(2, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(3, DL, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(3, DL, MVT::i32)) - }; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); - } - - case Intrinsic::r600_read_ngroups_x: - return LowerImplicitParameter(DAG, VT, DL, 0); - case Intrinsic::r600_read_ngroups_y: - return LowerImplicitParameter(DAG, VT, DL, 1); - case Intrinsic::r600_read_ngroups_z: - return LowerImplicitParameter(DAG, VT, DL, 2); - case Intrinsic::r600_read_global_size_x: - return LowerImplicitParameter(DAG, VT, DL, 3); - case Intrinsic::r600_read_global_size_y: - return LowerImplicitParameter(DAG, VT, DL, 4); - case Intrinsic::r600_read_global_size_z: - return LowerImplicitParameter(DAG, VT, DL, 5); - case Intrinsic::r600_read_local_size_x: - return LowerImplicitParameter(DAG, VT, DL, 6); - case Intrinsic::r600_read_local_size_y: - return LowerImplicitParameter(DAG, VT, DL, 7); - case Intrinsic::r600_read_local_size_z: - return LowerImplicitParameter(DAG, VT, DL, 8); - - case Intrinsic::AMDGPU_read_workdim: - return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4); - - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
- return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - } - // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) - break; - } - } // end switch(Op.getOpcode()) - return SDValue(); -} - -void R600TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); - return; - case ISD::FP_TO_UINT: - if (N->getValueType(0) == MVT::i1) { - Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); - return; - } - // Fall-through. Since we don't care about out of bounds values - // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint - // considers some extra cases which are not necessary here. - case ISD::FP_TO_SINT: { - SDValue Result; - if (expandFP_TO_SINT(N, Result, DAG)) - Results.push_back(Result); - return; - } - case ISD::SDIVREM: { - SDValue Op = SDValue(N, 1); - SDValue RES = LowerSDIVREM(Op, DAG); - Results.push_back(RES); - Results.push_back(RES.getValue(1)); - break; - } - case ISD::UDIVREM: { - SDValue Op = SDValue(N, 0); - LowerUDIVREM64(Op, DAG, Results); - break; - } - } -} - -SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, - SDValue Vector) const { - - SDLoc DL(Vector); - EVT VecVT = Vector.getValueType(); - EVT EltVT = VecVT.getVectorElementType(); - SmallVector Args; - - for (unsigned i = 0, e = VecVT.getVectorNumElements(); - i != e; ++i) { - Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector, - DAG.getConstant(i, DL, getVectorIdxTy()))); - } - - return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); -} - -SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - - SDLoc DL(Op); - SDValue Vector = Op.getOperand(0); - SDValue Index = Op.getOperand(1); - - if (isa(Index) || - Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) - return Op; - - Vector = vectorToVerticalVector(DAG, Vector); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), - Vector, Index); -} - -SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Vector = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Index = Op.getOperand(2); - - if (isa(Index) || - Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) - return Op; - - Vector = vectorToVerticalVector(DAG, Vector); - SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), - Vector, Value, Index); - return vectorToVerticalVector(DAG, Insert); -} - -SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { - // On hw >= R700, COS/SIN input must be between -1. and 1. 
- // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) - EVT VT = Op.getValueType(); - SDValue Arg = Op.getOperand(0); - SDLoc DL(Op); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.15915494309, DL, MVT::f32)), - DAG.getConstantFP(0.5, DL, MVT::f32))); - unsigned TrigNode; - switch (Op.getOpcode()) { - case ISD::FCOS: - TrigNode = AMDGPUISD::COS_HW; - break; - case ISD::FSIN: - TrigNode = AMDGPUISD::SIN_HW; - break; - default: - llvm_unreachable("Wrong trig opcode"); - } - SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, - DAG.getNode(ISD::FADD, DL, VT, FractPart, - DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= AMDGPUSubtarget::R700) - return TrigVal; - // On R600 hw, COS/SIN input must be between -Pi and Pi. - return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, - DAG.getConstantFP(3.14159265359, DL, MVT::f32)); -} - -SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift); - Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift); - HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow); - SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift); - - SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift); - SDValue LoBig = Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - SDValue Shift = Op.getOperand(2); - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue One = DAG.getConstant(1, DL, VT); - - const bool SRA = Op.getOpcode() == ISD::SRA_PARTS; - - SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT); - SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); - SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width); - SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift); - - // The dance around Width1 is necessary for 0 special case. - // Without it the CompShift might be 32, producing incorrect results in - // Overflow. So we do the shift in two steps, the alternative is to - // add a conditional to filter the special case. - - SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift); - Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One); - - SDValue HiSmall = DAG.getNode(SRA ? 
ISD::SRA : ISD::SRL, DL, VT, Hi, Shift); - SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift); - LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow); - - SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift); - SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero; - - Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT); - Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi); -} - -SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, - unsigned mainop, unsigned ovf) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue Lo = Op.getOperand(0); - SDValue Hi = Op.getOperand(1); - - SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi); - // Extend sign. - OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF, - DAG.getValueType(MVT::i1)); - - SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi); - - return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF); -} - -SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - return DAG.getNode( - ISD::SETCC, - DL, - MVT::i1, - Op, DAG.getConstantFP(0.0f, DL, MVT::f32), - DAG.getCondCode(ISD::SETNE) - ); -} - -SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, - unsigned DwordOffset) const { - unsigned ByteOffset = DwordOffset * 4; - PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); - - // We shouldn't be using an offset wider than 16-bits for implicit parameters. - assert(isInt<16>(ByteOffset)); - - return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR - MachinePointerInfo(ConstantPointerNull::get(PtrType)), - false, false, false, 0); -} - -bool R600TargetLowering::isZero(SDValue Op) const { - if(ConstantSDNode *Cst = dyn_cast(Op)) { - return Cst->isNullValue(); - } else if(ConstantFPSDNode *CstFP = dyn_cast(Op)){ - return CstFP->isZero(); - } else { - return false; - } -} - -SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue True = Op.getOperand(2); - SDValue False = Op.getOperand(3); - SDValue CC = Op.getOperand(4); - SDValue Temp; - - if (VT == MVT::f32) { - DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - if (MinMax) - return MinMax; - } - - // LHS and RHS are guaranteed to be the same value type - EVT CompareVT = LHS.getValueType(); - - // Check if we can lower this to a native operation. - - // Try to lower to a SET* instruction: - // - // SET* can match the following patterns: - // - // select_cc f32, f32, -1, 0, cc_supported - // select_cc f32, f32, 1.0f, 0.0f, cc_supported - // select_cc i32, i32, -1, 0, cc_supported - // - - // Move hardware True/False values to the correct operand. 
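As a sketch of the two canonicalizations performed below (the concrete SET*/CND* opcode is only chosen later, during instruction selection):

    select_cc x, y, 0, -1, seteq   -- invert cc, swap true/false -->   select_cc x, y, -1, 0, setne
    select_cc 0, y, t, f, setgt    -- swap operands              -->   select_cc y, 0, t, f, setlt
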
- ISD::CondCode CCOpcode = cast(CC)->get(); - ISD::CondCode InverseCC = - ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); - if (isHWTrueValue(False) && isHWFalseValue(True)) { - if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) { - std::swap(False, True); - CC = DAG.getCondCode(InverseCC); - } else { - ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC); - if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) { - std::swap(False, True); - std::swap(LHS, RHS); - CC = DAG.getCondCode(SwapInvCC); - } - } - } - - if (isHWTrueValue(True) && isHWFalseValue(False) && - (CompareVT == VT || VT == MVT::i32)) { - // This can be matched by a SET* instruction. - return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); - } - - // Try to lower to a CND* instruction: - // - // CND* can match the following patterns: - // - // select_cc f32, 0.0, f32, f32, cc_supported - // select_cc f32, 0.0, i32, i32, cc_supported - // select_cc i32, 0, f32, f32, cc_supported - // select_cc i32, 0, i32, i32, cc_supported - // - - // Try to move the zero value to the RHS - if (isZero(LHS)) { - ISD::CondCode CCOpcode = cast(CC)->get(); - // Try swapping the operands - ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode); - if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { - std::swap(LHS, RHS); - CC = DAG.getCondCode(CCSwapped); - } else { - // Try inverting the conditon and then swapping the operands - ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger()); - CCSwapped = ISD::getSetCCSwappedOperands(CCInv); - if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) { - std::swap(True, False); - std::swap(LHS, RHS); - CC = DAG.getCondCode(CCSwapped); - } - } - } - if (isZero(RHS)) { - SDValue Cond = LHS; - SDValue Zero = RHS; - ISD::CondCode CCOpcode = cast(CC)->get(); - if (CompareVT != VT) { - // Bitcast True / False to the correct types. This will end up being - // a nop, but it allows us to define only a single pattern in the - // .TD files for each CND* instruction rather than having to have - // one pattern for integer True/False and one for fp True/False - True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); - False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); - } - - switch (CCOpcode) { - case ISD::SETONE: - case ISD::SETUNE: - case ISD::SETNE: - CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); - Temp = True; - True = False; - False = Temp; - break; - default: - break; - } - SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, - Cond, Zero, - True, False, - DAG.getCondCode(CCOpcode)); - return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); - } - - // If we make it this for it means we have no native instructions to handle - // this SELECT_CC, so we must lower it. - SDValue HWTrue, HWFalse; - - if (CompareVT == MVT::f32) { - HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT); - HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT); - } else if (CompareVT == MVT::i32) { - HWTrue = DAG.getConstant(-1, DL, CompareVT); - HWFalse = DAG.getConstant(0, DL, CompareVT); - } - else { - llvm_unreachable("Unhandled value type in LowerSELECT_CC"); - } - - // Lower this unsupported SELECT_CC into a combination of two supported - // SELECT_CC operations. - SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); - - return DAG.getNode(ISD::SELECT_CC, DL, VT, - Cond, HWFalse, - True, False, - DAG.getCondCode(ISD::SETNE)); -} - -/// LLVM generates byte-addressed pointers. 
For indirect addressing, we need to -/// convert these pointers to a register index. Each register holds -/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the -/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used -/// for indirect addressing. -SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, - unsigned StackWidth, - SelectionDAG &DAG) const { - unsigned SRLPad; - switch(StackWidth) { - case 1: - SRLPad = 2; - break; - case 2: - SRLPad = 3; - break; - case 4: - SRLPad = 4; - break; - default: llvm_unreachable("Invalid stack width"); - } - - SDLoc DL(Ptr); - return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(SRLPad, DL, MVT::i32)); -} - -void R600TargetLowering::getStackAddress(unsigned StackWidth, - unsigned ElemIdx, - unsigned &Channel, - unsigned &PtrIncr) const { - switch (StackWidth) { - default: - case 1: - Channel = 0; - if (ElemIdx > 0) { - PtrIncr = 1; - } else { - PtrIncr = 0; - } - break; - case 2: - Channel = ElemIdx % 2; - if (ElemIdx == 2) { - PtrIncr = 1; - } else { - PtrIncr = 0; - } - break; - case 4: - Channel = ElemIdx; - PtrIncr = 0; - break; - } -} - -SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - StoreSDNode *StoreNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Ptr = Op.getOperand(2); - - SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Result.getNode()) { - return Result; - } - - if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { - if (StoreNode->isTruncatingStore()) { - EVT VT = Value.getValueType(); - assert(VT.bitsLE(MVT::i32)); - EVT MemVT = StoreNode->getMemoryVT(); - SDValue MaskConstant; - if (MemVT == MVT::i8) { - MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32); - } else { - assert(MemVT == MVT::i16); - MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32); - } - SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(0x00000003, DL, VT)); - SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); - SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex, - DAG.getConstant(3, DL, VT)); - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift); - SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift); - // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32 - // vector instead. - SDValue Src[4] = { - ShiftedValue, - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - Mask - }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); - SDValue Args[3] = { Chain, Input, DWordAddr }; - return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, - Op->getVTList(), Args, MemVT, - StoreNode->getMemOperand()); - } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - Value.getValueType().bitsGE(MVT::i32)) { - // Convert pointer from byte address to dword address. 
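A dword is four bytes, so the conversion below is just a logical shift right by two; for example:

    dword_index = byte_address >> 2   // byte offset 16 -> dword index 4
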
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), - DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), - Ptr, DAG.getConstant(2, DL, MVT::i32))); - - if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { - llvm_unreachable("Truncated and indexed stores not supported yet"); - } else { - Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); - } - return Chain; - } - } - - EVT ValueVT = Value.getValueType(); - - if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } - - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) { - return Ret; - } - // Lowering for indirect addressing - - const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (ValueVT.isVector()) { - unsigned NumElemVT = ValueVT.getVectorNumElements(); - EVT ElemVT = ValueVT.getVectorElementType(); - SmallVector Stores(NumElemVT); - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, - Value, DAG.getConstant(i, DL, MVT::i32)); - - Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Elem, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32)); - } - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); - } else { - if (ValueVT == MVT::i8) { - Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); - } - Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); // Channel - } - - return Chain; -} - -// return (512 + (kc_bank << 12) -static int -ConstantAddressBlock(unsigned AddressSpace) { - switch (AddressSpace) { - case AMDGPUAS::CONSTANT_BUFFER_0: - return 512; - case AMDGPUAS::CONSTANT_BUFFER_1: - return 512 + 4096; - case AMDGPUAS::CONSTANT_BUFFER_2: - return 512 + 4096 * 2; - case AMDGPUAS::CONSTANT_BUFFER_3: - return 512 + 4096 * 3; - case AMDGPUAS::CONSTANT_BUFFER_4: - return 512 + 4096 * 4; - case AMDGPUAS::CONSTANT_BUFFER_5: - return 512 + 4096 * 5; - case AMDGPUAS::CONSTANT_BUFFER_6: - return 512 + 4096 * 6; - case AMDGPUAS::CONSTANT_BUFFER_7: - return 512 + 4096 * 7; - case AMDGPUAS::CONSTANT_BUFFER_8: - return 512 + 4096 * 8; - case AMDGPUAS::CONSTANT_BUFFER_9: - return 512 + 4096 * 9; - case AMDGPUAS::CONSTANT_BUFFER_10: - return 512 + 4096 * 10; - case AMDGPUAS::CONSTANT_BUFFER_11: - return 512 + 4096 * 11; - case AMDGPUAS::CONSTANT_BUFFER_12: - return 512 + 4096 * 12; - case AMDGPUAS::CONSTANT_BUFFER_13: - return 512 + 4096 * 13; - case AMDGPUAS::CONSTANT_BUFFER_14: - return 512 + 4096 * 14; - case AMDGPUAS::CONSTANT_BUFFER_15: - return 512 + 4096 * 15; - default: - return -1; - } -} - -SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const -{ - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - - SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG); - if (Ret.getNode()) { - SDValue Ops[2] = { - Ret, - Chain - }; - return DAG.getMergeValues(Ops, DL); - } - - // Lower 
loads constant address space global variable loads - if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa(GetUnderlyingObject( - LoadNode->getMemOperand()->getValue(), *getDataLayout()))) { - - SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL, - getPointerTy(AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - } - - if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { - SDValue MergedValues[2] = { - ScalarizeVectorLoad(Op, DAG), - Chain - }; - return DAG.getMergeValues(MergedValues, DL); - } - - int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); - if (ConstantBlock > -1 && - ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || - (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { - SDValue Result; - if (isa(LoadNode->getMemOperand()->getValue()) || - isa(LoadNode->getMemOperand()->getValue()) || - isa(Ptr)) { - SDValue Slots[4]; - for (unsigned i = 0; i < 4; i++) { - // We want Const position encoded with the following formula : - // (((512 + (kc_bank << 12) + const_index) << 2) + chan) - // const_index is Ptr computed by llvm using an alignment of 16. - // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and - // then div by 4 at the ISel step - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); - Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); - } - EVT NewVT = MVT::v4i32; - unsigned NumElements = 4; - if (VT.isVector()) { - NewVT = VT; - NumElements = VT.getVectorNumElements(); - } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, - makeArrayRef(Slots, NumElements)); - } else { - // non-constant ptr can't be folded, keeps it as a v4f32 load - Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, - DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(4, DL, MVT::i32)), - DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) - ); - } - - if (!VT.isVector()) { - Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, - DAG.getConstant(0, DL, MVT::i32)); - } - - SDValue MergedValues[2] = { - Result, - Chain - }; - return DAG.getMergeValues(MergedValues, DL); - } - - // For most operations returning SDValue() will result in the node being - // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we - // need to manually expand loads that may be legal in some address spaces and - // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for - // compute shaders, since the data is sign extended when it is uploaded to the - // buffer. However SEXT loads from other address spaces are not supported, so - // we need to expand them here. 
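The expansion performed below rewrites the sign-extending load as a plain extending load followed by an in-register sign extension, roughly:

    (i32 sextload i8, ptr)  ->  (i32 sign_extend_inreg (i32 extload i8, ptr), i8)
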
- if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { - EVT MemVT = LoadNode->getMemoryVT(); - assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, - LoadNode->getPointerInfo(), MemVT, - LoadNode->isVolatile(), - LoadNode->isNonTemporal(), - LoadNode->isInvariant(), - LoadNode->getAlignment()); - SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, - DAG.getValueType(MemVT)); - - SDValue MergedValues[2] = { Res, Chain }; - return DAG.getMergeValues(MergedValues, DL); - } - - if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return SDValue(); - } - - // Lowering for indirect addressing - const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast(Subtarget->getFrameLowering()); - unsigned StackWidth = TFL->getStackWidth(MF); - - Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); - - if (VT.isVector()) { - unsigned NumElemVT = VT.getVectorNumElements(); - EVT ElemVT = VT.getVectorElementType(); - SDValue Loads[4]; - - assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " - "vector width in load"); - - for (unsigned i = 0; i < NumElemVT; ++i) { - unsigned Channel, PtrIncr; - getStackAddress(StackWidth, i, Channel, PtrIncr); - Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, - DAG.getConstant(PtrIncr, DL, MVT::i32)); - Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT, - Chain, Ptr, - DAG.getTargetConstant(Channel, DL, MVT::i32), - Op.getOperand(2)); - } - for (unsigned i = NumElemVT; i < 4; ++i) { - Loads[i] = DAG.getUNDEF(ElemVT); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); - } else { - LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), // Channel - Op.getOperand(2)); - } - - SDValue Ops[2] = { - LoweredLoad, - Chain - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue Cond = Op.getOperand(1); - SDValue Jump = Op.getOperand(2); - - return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(), - Chain, Jump, Cond); -} - -/// XXX Only kernel functions are supported, so we can assume for now that -/// every function is a kernel function, but in the future we should use -/// separate calling conventions for kernel and non-kernel functions. -SDValue R600TargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo(); - - SmallVector LocalIns; - - getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns); - - AnalyzeFormalArguments(CCInfo, LocalIns); - - for (unsigned i = 0, e = Ins.size(); i < e; ++i) { - CCValAssign &VA = ArgLocs[i]; - const ISD::InputArg &In = Ins[i]; - EVT VT = In.VT; - EVT MemVT = VA.getLocVT(); - if (!VT.isVector() && MemVT.isVector()) { - // Get load source type if scalarized. 
- MemVT = MemVT.getVectorElementType(); - } - - if (MFI->getShaderType() != ShaderType::COMPUTE) { - unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); - SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); - InVals.push_back(Register); - continue; - } - - PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUAS::CONSTANT_BUFFER_0); - - // i64 isn't a legal type, so the register type used ends up as i32, which - // isn't expected here. It attempts to create this sextload, but it ends up - // being invalid. Somehow this seems to work with i64 arguments, but breaks - // for <1 x i64>. - - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - ISD::LoadExtType Ext = ISD::NON_EXTLOAD; - if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) { - // FIXME: This should really check the extload type, but the handling of - // extload vector parameters seems to be broken. - - // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - Ext = ISD::SEXTLOAD; - } - - // Compute the offset from the value. - // XXX - I think PartOffset should give you this, but it seems to give the - // size of the register which isn't useful. - - unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); - unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = 36 + VA.getLocMemOffset(); - - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); - SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), - DAG.getUNDEF(MVT::i32), - PtrInfo, - MemVT, false, true, true, 4); - - // 4 is the preferred alignment for the CONSTANT memory space. - InVals.push_back(Arg); - MFI->ABIArgOffset = Offset + MemVT.getStoreSize(); - } - return Chain; -} - -EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) - return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} - -static SDValue CompactSwizzlableVector( - SelectionDAG &DAG, SDValue VectorEntry, - DenseMap &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); - assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; - - for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::UNDEF) - // We mask write here to teach later passes that the ith element of this - // vector is undef. Thus we can use it to reduce 128 bits reg usage, - // break false dependencies and additionnaly make assembly easier to read. 
- RemapSwizzle[i] = 7; // SEL_MASK_WRITE - if (ConstantFPSDNode *C = dyn_cast(NewBldVec[i])) { - if (C->isZero()) { - RemapSwizzle[i] = 4; // SEL_0 - NewBldVec[i] = DAG.getUNDEF(MVT::f32); - } else if (C->isExactlyValue(1.0)) { - RemapSwizzle[i] = 5; // SEL_1 - NewBldVec[i] = DAG.getUNDEF(MVT::f32); - } - } - - if (NewBldVec[i].getOpcode() == ISD::UNDEF) - continue; - for (unsigned j = 0; j < i; j++) { - if (NewBldVec[i] == NewBldVec[j]) { - NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType()); - RemapSwizzle[i] = j; - break; - } - } - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); -} - -static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, - DenseMap &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); - assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; - bool isUnmovable[4] = { false, false, false, false }; - for (unsigned i = 0; i < 4; i++) { - RemapSwizzle[i] = i; - if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) - ->getZExtValue(); - if (i == Idx) - isUnmovable[Idx] = true; - } - } - - for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - unsigned Idx = dyn_cast(NewBldVec[i].getOperand(1)) - ->getZExtValue(); - if (isUnmovable[Idx]) - continue; - // Swap i and Idx - std::swap(NewBldVec[Idx], NewBldVec[i]); - std::swap(RemapSwizzle[i], RemapSwizzle[Idx]); - break; - } - } - - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); -} - - -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, - SDValue Swz[4], SelectionDAG &DAG, - SDLoc DL) const { - assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); - // Old -> New swizzle values - DenseMap SwizzleRemap; - - BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); - for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); - } - - SwizzleRemap.clear(); - BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); - for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) - Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); - } - - return BuildVector; -} - - -//===----------------------------------------------------------------------===// -// Custom DAG Optimizations -//===----------------------------------------------------------------------===// - -SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - - switch (N->getOpcode()) { - default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) - case ISD::FP_ROUND: { - SDValue Arg = N->getOperand(0); - if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) { - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0), - Arg.getOperand(0)); - } - break; - } - - // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) -> - // (i32 select_cc f32, f32, -1, 0 cc) - // - // Mesa's GLSL frontend generates the above pattern a lot and we can lower - // this to one of the SET*_DX10 instructions. 
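// As a concrete instance (setlt is just an example condition code):
//
//   (i32 fp_to_sint (fneg (select_cc f32 %a, f32 %b, 1.0, 0.0, setlt)))
//     -> (i32 select_cc f32 %a, f32 %b, -1, 0, setlt)
//
// i.e. the 1.0/0.0 float select plus the negate-and-convert collapses into
// a single integer select producing -1 or 0.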
- case ISD::FP_TO_SINT: { - SDValue FNeg = N->getOperand(0); - if (FNeg.getOpcode() != ISD::FNEG) { - return SDValue(); - } - SDValue SelectCC = FNeg.getOperand(0); - if (SelectCC.getOpcode() != ISD::SELECT_CC || - SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS - SelectCC.getOperand(2).getValueType() != MVT::f32 || // True - !isHWTrueValue(SelectCC.getOperand(2)) || - !isHWFalseValue(SelectCC.getOperand(3))) { - return SDValue(); - } - - SDLoc dl(N); - return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0), - SelectCC.getOperand(0), // LHS - SelectCC.getOperand(1), // RHS - DAG.getConstant(-1, dl, MVT::i32), // True - DAG.getConstant(0, dl, MVT::i32), // False - SelectCC.getOperand(4)); // CC - - break; - } - - // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx - // => build_vector elt0, ... , NewEltIdx, ... , eltN - case ISD::INSERT_VECTOR_ELT: { - SDValue InVec = N->getOperand(0); - SDValue InVal = N->getOperand(1); - SDValue EltNo = N->getOperand(2); - SDLoc dl(N); - - // If the inserted element is an UNDEF, just use the input vector. - if (InVal.getOpcode() == ISD::UNDEF) - return InVec; - - EVT VT = InVec.getValueType(); - - // If we can't generate a legal BUILD_VECTOR, exit - if (!isOperationLegal(ISD::BUILD_VECTOR, VT)) - return SDValue(); - - // Check that we know which element is being inserted - if (!isa(EltNo)) - return SDValue(); - unsigned Elt = cast(EltNo)->getZExtValue(); - - // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially - // be converted to a BUILD_VECTOR). Fill in the Ops vector with the - // vector elements. - SmallVector Ops; - if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - Ops.append(InVec.getNode()->op_begin(), - InVec.getNode()->op_end()); - } else if (InVec.getOpcode() == ISD::UNDEF) { - unsigned NElts = VT.getVectorNumElements(); - Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); - } else { - return SDValue(); - } - - // Insert the element - if (Elt < Ops.size()) { - // All the operands of BUILD_VECTOR must have the same type; - // we enforce that here. - EVT OpVT = Ops[0].getValueType(); - if (InVal.getValueType() != OpVT) - InVal = OpVT.bitsGT(InVal.getValueType()) ? 
- DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) : - DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal); - Ops[Elt] = InVal; - } - - // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); - } - - // Extract_vec (Build_vector) generated by custom lowering - // also needs to be customly combined - case ISD::EXTRACT_VECTOR_ELT: { - SDValue Arg = N->getOperand(0); - if (Arg.getOpcode() == ISD::BUILD_VECTOR) { - if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { - unsigned Element = Const->getZExtValue(); - return Arg->getOperand(Element); - } - } - if (Arg.getOpcode() == ISD::BITCAST && - Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { - if (ConstantSDNode *Const = dyn_cast(N->getOperand(1))) { - unsigned Element = Const->getZExtValue(); - return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(), - Arg->getOperand(0).getOperand(Element)); - } - } - } - - case ISD::SELECT_CC: { - // Try common optimizations - SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - if (Ret.getNode()) - return Ret; - - // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> - // selectcc x, y, a, b, inv(cc) - // - // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> - // selectcc x, y, a, b, cc - SDValue LHS = N->getOperand(0); - if (LHS.getOpcode() != ISD::SELECT_CC) { - return SDValue(); - } - - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - ISD::CondCode NCC = cast(N->getOperand(4))->get(); - - if (LHS.getOperand(2).getNode() != True.getNode() || - LHS.getOperand(3).getNode() != False.getNode() || - RHS.getNode() != False.getNode()) { - return SDValue(); - } - - switch (NCC) { - default: return SDValue(); - case ISD::SETNE: return LHS; - case ISD::SETEQ: { - ISD::CondCode LHSCC = cast(LHS.getOperand(4))->get(); - LHSCC = ISD::getSetCCInverse(LHSCC, - LHS.getOperand(0).getValueType().isInteger()); - if (DCI.isBeforeLegalizeOps() || - isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType())) - return DAG.getSelectCC(SDLoc(N), - LHS.getOperand(0), - LHS.getOperand(1), - LHS.getOperand(2), - LHS.getOperand(3), - LHSCC); - break; - } - } - return SDValue(); - } - - case AMDGPUISD::EXPORT: { - SDValue Arg = N->getOperand(1); - if (Arg.getOpcode() != ISD::BUILD_VECTOR) - break; - - SDValue NewArgs[8] = { - N->getOperand(0), // Chain - SDValue(), - N->getOperand(2), // ArrayBase - N->getOperand(3), // Type - N->getOperand(4), // SWZ_X - N->getOperand(5), // SWZ_Y - N->getOperand(6), // SWZ_Z - N->getOperand(7) // SWZ_W - }; - SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL); - return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs); - } - case AMDGPUISD::TEXTURE_FETCH: { - SDValue Arg = N->getOperand(1); - if (Arg.getOpcode() != ISD::BUILD_VECTOR) - break; - - SDValue NewArgs[19] = { - N->getOperand(0), - N->getOperand(1), - N->getOperand(2), - N->getOperand(3), - N->getOperand(4), - N->getOperand(5), - N->getOperand(6), - N->getOperand(7), - N->getOperand(8), - N->getOperand(9), - N->getOperand(10), - N->getOperand(11), - N->getOperand(12), - N->getOperand(13), - N->getOperand(14), - N->getOperand(15), - N->getOperand(16), - N->getOperand(17), - N->getOperand(18), - }; - SDLoc DL(N); - NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); - return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); - } - } - - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); -} - -static bool -FoldOperand(SDNode *ParentNode, unsigned 
SrcIdx, SDValue &Src, SDValue &Neg, - SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); - if (!Src.isMachineOpcode()) - return false; - switch (Src.getMachineOpcode()) { - case AMDGPU::FNEG_R600: - if (!Neg.getNode()) - return false; - Src = Src.getOperand(0); - Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); - return true; - case AMDGPU::FABS_R600: - if (!Abs.getNode()) - return false; - Src = Src.getOperand(0); - Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); - return true; - case AMDGPU::CONST_COPY: { - unsigned Opcode = ParentNode->getMachineOpcode(); - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - - if (!Sel.getNode()) - return false; - - SDValue CstOffset = Src.getOperand(0); - if (ParentNode->getValueType(0).isVector()) - return false; - - // Gather constants values - int SrcIndices[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) - }; - std::vector Consts; - for (int OtherSrcIdx : SrcIndices) { - int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx); - if (OtherSrcIdx < 0 || OtherSelIdx < 0) - continue; - if (HasDst) { - OtherSrcIdx--; - OtherSelIdx--; - } - if (RegisterSDNode *Reg = - dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { - ConstantSDNode *Cst - = cast(ParentNode->getOperand(OtherSelIdx)); - Consts.push_back(Cst->getZExtValue()); - } - } - } - - ConstantSDNode *Cst = cast(CstOffset); - Consts.push_back(Cst->getZExtValue()); - if (!TII->fitsConstReadLimitations(Consts)) { - return false; - } - - Sel = CstOffset; - Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); - return true; - } - case AMDGPU::MOV_IMM_I32: - case AMDGPU::MOV_IMM_F32: { - unsigned ImmReg = AMDGPU::ALU_LITERAL_X; - uint64_t ImmValue = 0; - - - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { - ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); - float FloatValue = FPC->getValueAPF().convertToFloat(); - if (FloatValue == 0.0) { - ImmReg = AMDGPU::ZERO; - } else if (FloatValue == 0.5) { - ImmReg = AMDGPU::HALF; - } else if (FloatValue == 1.0) { - ImmReg = AMDGPU::ONE; - } else { - ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); - } - } else { - ConstantSDNode *C = dyn_cast(Src.getOperand(0)); - uint64_t Value = C->getZExtValue(); - if (Value == 0) { - ImmReg = AMDGPU::ZERO; - } else if (Value == 1) { - ImmReg = AMDGPU::ONE_INT; - } else { - ImmValue = Value; - } - } - - // Check that we aren't already using an immediate. - // XXX: It's possible for an instruction to have more than one - // immediate operand, but this is not supported yet. 
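// For illustration (hypothetical operands): folding the immediate of
// (MUL_IEEE x, (MOV_IMM_F32 1.0)) simply rewrites the source to the inline
// constant register ONE, leaving the MOV to be deleted as dead code. A
// value with no inline equivalent (say 3.5f) instead uses ALU_LITERAL_X,
// with its bit pattern written into the instruction's literal operand,
// which is only possible if that literal slot is still unused (zero);
// hence the check below.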
- if (ImmReg == AMDGPU::ALU_LITERAL_X) { - if (!Imm.getNode()) - return false; - ConstantSDNode *C = dyn_cast(Imm); - assert(C); - if (C->getZExtValue()) - return false; - Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32); - } - Src = DAG.getRegister(ImmReg, MVT::i32); - return true; - } - default: - return false; - } -} - - -/// \brief Fold the instructions after selecting them -SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { - const R600InstrInfo *TII = - static_cast(DAG.getSubtarget().getInstrInfo()); - if (!Node->isMachineOpcode()) - return Node; - unsigned Opcode = Node->getMachineOpcode(); - SDValue FakeOp; - - std::vector Ops(Node->op_begin(), Node->op_end()); - - if (Opcode == AMDGPU::DOT_4) { - int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) - }; - int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) - }; - int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) - }; - for (unsigned i = 0; i < 8; i++) { - if (OperandIdx[i] < 0) - return Node; - SDValue &Src = Ops[OperandIdx[i] - 1]; - SDValue &Neg = Ops[NegIdx[i] - 1]; - SDValue &Abs = Ops[AbsIdx[i] - 1]; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - if (HasDst) - SelIdx--; - SDValue &Sel = (SelIdx > -1) ? 
Ops[SelIdx] : FakeOp; - if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } else if (Opcode == AMDGPU::REG_SEQUENCE) { - for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { - SDValue &Src = Ops[i]; - if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } else if (Opcode == AMDGPU::CLAMP_R600) { - SDValue Src = Node->getOperand(0); - if (!Src.isMachineOpcode() || - !TII->hasInstrModifiers(Src.getMachineOpcode())) - return Node; - int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), - AMDGPU::OpName::clamp); - if (ClampIdx < 0) - return Node; - SDLoc DL(Node); - std::vector Ops(Src->op_begin(), Src->op_end()); - Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); - return DAG.getMachineNode(Src.getMachineOpcode(), DL, - Node->getVTList(), Ops); - } else { - if (!TII->hasInstrModifiers(Opcode)) - return Node; - int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) - }; - int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) - }; - int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), - -1 - }; - for (unsigned i = 0; i < 3; i++) { - if (OperandIdx[i] < 0) - return Node; - SDValue &Src = Ops[OperandIdx[i] - 1]; - SDValue &Neg = Ops[NegIdx[i] - 1]; - SDValue FakeAbs; - SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; - int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); - if (HasDst) { - SelIdx--; - ImmIdx--; - } - SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp; - SDValue &Imm = Ops[ImmIdx]; - if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG)) - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } - } - - return Node; -} diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h deleted file mode 100644 index c06d3c4fd30..00000000000 --- a/lib/Target/R600/R600ISelLowering.h +++ /dev/null @@ -1,80 +0,0 @@ -//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 DAG Lowering interface definition -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H -#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H - -#include "AMDGPUISelLowering.h" - -namespace llvm { - -class R600InstrInfo; - -class R600TargetLowering : public AMDGPUTargetLowering { -public: - R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - void ReplaceNodeResults(SDNode * N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const override; - SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - EVT getSetCCResultType(LLVMContext &, EVT VT) const override; -private: - unsigned Gen; - /// Each OpenCL kernel has nine implicit parameters that are stored in the - /// first nine dwords of a Vertex Buffer. These implicit parameters are - /// lowered to load instructions which retrieve the values from the Vertex - /// Buffer. - SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, unsigned DwordOffset) const; - - void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, - MachineRegisterInfo & MRI, unsigned dword_offset) const; - SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - SDLoc DL) const; - SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; - - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSHLParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSRXParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerUADDSUBO(SDValue Op, SelectionDAG &DAG, - unsigned mainop, unsigned ovf) const; - - SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth, - SelectionDAG &DAG) const; - void getStackAddress(unsigned StackWidth, unsigned ElemIdx, - unsigned &Channel, unsigned &PtrIncr) const; - bool isZero(SDValue Op) const; - SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; -}; - -} // End namespace llvm; - -#endif diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td deleted file mode 100644 index 0ffd485476e..00000000000 --- a/lib/Target/R600/R600InstrFormats.td +++ /dev/null @@ -1,495 +0,0 @@ -//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 Instruction format definitions. 
-// -//===----------------------------------------------------------------------===// - -class InstR600 pattern, - InstrItinClass itin> - : AMDGPUInst { - - field bits<64> Inst; - bit Trig = 0; - bit Op3 = 0; - bit isVector = 0; - bits<2> FlagOperandIdx = 0; - bit Op1 = 0; - bit Op2 = 0; - bit LDS_1A = 0; - bit LDS_1A1D = 0; - bit HasNativeOperands = 0; - bit VTXInst = 0; - bit TEXInst = 0; - bit ALUInst = 0; - bit IsExport = 0; - bit LDS_1A2D = 0; - - let Namespace = "AMDGPU"; - let OutOperandList = outs; - let InOperandList = ins; - let AsmString = asm; - let Pattern = pattern; - let Itinerary = itin; - - // No AsmMatcher support. - let isCodeGenOnly = 1; - - let TSFlags{4} = Trig; - let TSFlags{5} = Op3; - - // Vector instructions are instructions that must fill all slots in an - // instruction group - let TSFlags{6} = isVector; - let TSFlags{8-7} = FlagOperandIdx; - let TSFlags{9} = HasNativeOperands; - let TSFlags{10} = Op1; - let TSFlags{11} = Op2; - let TSFlags{12} = VTXInst; - let TSFlags{13} = TEXInst; - let TSFlags{14} = ALUInst; - let TSFlags{15} = LDS_1A; - let TSFlags{16} = LDS_1A1D; - let TSFlags{17} = IsExport; - let TSFlags{18} = LDS_1A2D; -} - -//===----------------------------------------------------------------------===// -// ALU instructions -//===----------------------------------------------------------------------===// - -class R600_ALU_LDS_Word0 { - field bits<32> Word0; - - bits<11> src0; - bits<1> src0_rel; - bits<11> src1; - bits<1> src1_rel; - bits<3> index_mode = 0; - bits<2> pred_sel; - bits<1> last; - - bits<9> src0_sel = src0{8-0}; - bits<2> src0_chan = src0{10-9}; - bits<9> src1_sel = src1{8-0}; - bits<2> src1_chan = src1{10-9}; - - let Word0{8-0} = src0_sel; - let Word0{9} = src0_rel; - let Word0{11-10} = src0_chan; - let Word0{21-13} = src1_sel; - let Word0{22} = src1_rel; - let Word0{24-23} = src1_chan; - let Word0{28-26} = index_mode; - let Word0{30-29} = pred_sel; - let Word0{31} = last; -} - -class R600ALU_Word0 : R600_ALU_LDS_Word0 { - - bits<1> src0_neg; - bits<1> src1_neg; - - let Word0{12} = src0_neg; - let Word0{25} = src1_neg; -} - -class R600ALU_Word1 { - field bits<32> Word1; - - bits<11> dst; - bits<3> bank_swizzle; - bits<1> dst_rel; - bits<1> clamp; - - bits<7> dst_sel = dst{6-0}; - bits<2> dst_chan = dst{10-9}; - - let Word1{20-18} = bank_swizzle; - let Word1{27-21} = dst_sel; - let Word1{28} = dst_rel; - let Word1{30-29} = dst_chan; - let Word1{31} = clamp; -} - -class R600ALU_Word1_OP2 alu_inst> : R600ALU_Word1{ - - bits<1> src0_abs; - bits<1> src1_abs; - bits<1> update_exec_mask; - bits<1> update_pred; - bits<1> write; - bits<2> omod; - - let Word1{0} = src0_abs; - let Word1{1} = src1_abs; - let Word1{2} = update_exec_mask; - let Word1{3} = update_pred; - let Word1{4} = write; - let Word1{6-5} = omod; - let Word1{17-7} = alu_inst; -} - -class R600ALU_Word1_OP3 alu_inst> : R600ALU_Word1{ - - bits<11> src2; - bits<1> src2_rel; - bits<1> src2_neg; - - bits<9> src2_sel = src2{8-0}; - bits<2> src2_chan = src2{10-9}; - - let Word1{8-0} = src2_sel; - let Word1{9} = src2_rel; - let Word1{11-10} = src2_chan; - let Word1{12} = src2_neg; - let Word1{17-13} = alu_inst; -} - -class R600LDS_Word1 { - field bits<32> Word1; - - bits<11> src2; - bits<9> src2_sel = src2{8-0}; - bits<2> src2_chan = src2{10-9}; - bits<1> src2_rel; - // offset specifies the stride offset to the second set of data to be read - // from. This is a dword offset. 
- bits<5> alu_inst = 17; // OP3_INST_LDS_IDX_OP - bits<3> bank_swizzle; - bits<6> lds_op; - bits<2> dst_chan = 0; - - let Word1{8-0} = src2_sel; - let Word1{9} = src2_rel; - let Word1{11-10} = src2_chan; - let Word1{17-13} = alu_inst; - let Word1{20-18} = bank_swizzle; - let Word1{26-21} = lds_op; - let Word1{30-29} = dst_chan; -} - - -/* -XXX: R600 subtarget uses a slightly different encoding than the other -subtargets. We currently handle this in R600MCCodeEmitter, but we may -want to use these instruction classes in the future. - -class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { - - bits<1> fog_merge; - bits<10> alu_inst; - - let Inst{37} = fog_merge; - let Inst{39-38} = omod; - let Inst{49-40} = alu_inst; -} - -class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { - - bits<11> alu_inst; - - let Inst{38-37} = omod; - let Inst{49-39} = alu_inst; -} -*/ - -//===----------------------------------------------------------------------===// -// Vertex Fetch instructions -//===----------------------------------------------------------------------===// - -class VTX_WORD0 { - field bits<32> Word0; - bits<7> src_gpr; - bits<5> VC_INST; - bits<2> FETCH_TYPE; - bits<1> FETCH_WHOLE_QUAD; - bits<8> BUFFER_ID; - bits<1> SRC_REL; - bits<2> SRC_SEL_X; - - let Word0{4-0} = VC_INST; - let Word0{6-5} = FETCH_TYPE; - let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = BUFFER_ID; - let Word0{22-16} = src_gpr; - let Word0{23} = SRC_REL; - let Word0{25-24} = SRC_SEL_X; -} - -class VTX_WORD0_eg : VTX_WORD0 { - - bits<6> MEGA_FETCH_COUNT; - - let Word0{31-26} = MEGA_FETCH_COUNT; -} - -class VTX_WORD0_cm : VTX_WORD0 { - - bits<2> SRC_SEL_Y; - bits<2> STRUCTURED_READ; - bits<1> LDS_REQ; - bits<1> COALESCED_READ; - - let Word0{27-26} = SRC_SEL_Y; - let Word0{29-28} = STRUCTURED_READ; - let Word0{30} = LDS_REQ; - let Word0{31} = COALESCED_READ; -} - -class VTX_WORD1_GPR { - field bits<32> Word1; - bits<7> dst_gpr; - bits<1> DST_REL; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; - bits<1> USE_CONST_FIELDS; - bits<6> DATA_FORMAT; - bits<2> NUM_FORMAT_ALL; - bits<1> FORMAT_COMP_ALL; - bits<1> SRF_MODE_ALL; - - let Word1{6-0} = dst_gpr; - let Word1{7} = DST_REL; - let Word1{8} = 0; // Reserved - let Word1{11-9} = DST_SEL_X; - let Word1{14-12} = DST_SEL_Y; - let Word1{17-15} = DST_SEL_Z; - let Word1{20-18} = DST_SEL_W; - let Word1{21} = USE_CONST_FIELDS; - let Word1{27-22} = DATA_FORMAT; - let Word1{29-28} = NUM_FORMAT_ALL; - let Word1{30} = FORMAT_COMP_ALL; - let Word1{31} = SRF_MODE_ALL; -} - -//===----------------------------------------------------------------------===// -// Texture fetch instructions -//===----------------------------------------------------------------------===// - -class TEX_WORD0 { - field bits<32> Word0; - - bits<5> TEX_INST; - bits<2> INST_MOD; - bits<1> FETCH_WHOLE_QUAD; - bits<8> RESOURCE_ID; - bits<7> SRC_GPR; - bits<1> SRC_REL; - bits<1> ALT_CONST; - bits<2> RESOURCE_INDEX_MODE; - bits<2> SAMPLER_INDEX_MODE; - - let Word0{4-0} = TEX_INST; - let Word0{6-5} = INST_MOD; - let Word0{7} = FETCH_WHOLE_QUAD; - let Word0{15-8} = RESOURCE_ID; - let Word0{22-16} = SRC_GPR; - let Word0{23} = SRC_REL; - let Word0{24} = ALT_CONST; - let Word0{26-25} = RESOURCE_INDEX_MODE; - let Word0{28-27} = SAMPLER_INDEX_MODE; -} - -class TEX_WORD1 { - field bits<32> Word1; - - bits<7> DST_GPR; - bits<1> DST_REL; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; - bits<7> LOD_BIAS; - bits<1> COORD_TYPE_X; - bits<1> COORD_TYPE_Y; - bits<1> 
COORD_TYPE_Z; - bits<1> COORD_TYPE_W; - - let Word1{6-0} = DST_GPR; - let Word1{7} = DST_REL; - let Word1{11-9} = DST_SEL_X; - let Word1{14-12} = DST_SEL_Y; - let Word1{17-15} = DST_SEL_Z; - let Word1{20-18} = DST_SEL_W; - let Word1{27-21} = LOD_BIAS; - let Word1{28} = COORD_TYPE_X; - let Word1{29} = COORD_TYPE_Y; - let Word1{30} = COORD_TYPE_Z; - let Word1{31} = COORD_TYPE_W; -} - -class TEX_WORD2 { - field bits<32> Word2; - - bits<5> OFFSET_X; - bits<5> OFFSET_Y; - bits<5> OFFSET_Z; - bits<5> SAMPLER_ID; - bits<3> SRC_SEL_X; - bits<3> SRC_SEL_Y; - bits<3> SRC_SEL_Z; - bits<3> SRC_SEL_W; - - let Word2{4-0} = OFFSET_X; - let Word2{9-5} = OFFSET_Y; - let Word2{14-10} = OFFSET_Z; - let Word2{19-15} = SAMPLER_ID; - let Word2{22-20} = SRC_SEL_X; - let Word2{25-23} = SRC_SEL_Y; - let Word2{28-26} = SRC_SEL_Z; - let Word2{31-29} = SRC_SEL_W; -} - -//===----------------------------------------------------------------------===// -// Control Flow Instructions -//===----------------------------------------------------------------------===// - -class CF_WORD1_R600 { - field bits<32> Word1; - - bits<3> POP_COUNT; - bits<5> CF_CONST; - bits<2> COND; - bits<3> COUNT; - bits<6> CALL_COUNT; - bits<1> COUNT_3; - bits<1> END_OF_PROGRAM; - bits<1> VALID_PIXEL_MODE; - bits<7> CF_INST; - bits<1> WHOLE_QUAD_MODE; - bits<1> BARRIER; - - let Word1{2-0} = POP_COUNT; - let Word1{7-3} = CF_CONST; - let Word1{9-8} = COND; - let Word1{12-10} = COUNT; - let Word1{18-13} = CALL_COUNT; - let Word1{19} = COUNT_3; - let Word1{21} = END_OF_PROGRAM; - let Word1{22} = VALID_PIXEL_MODE; - let Word1{29-23} = CF_INST; - let Word1{30} = WHOLE_QUAD_MODE; - let Word1{31} = BARRIER; -} - -class CF_WORD0_EG { - field bits<32> Word0; - - bits<24> ADDR; - bits<3> JUMPTABLE_SEL; - - let Word0{23-0} = ADDR; - let Word0{26-24} = JUMPTABLE_SEL; -} - -class CF_WORD1_EG { - field bits<32> Word1; - - bits<3> POP_COUNT; - bits<5> CF_CONST; - bits<2> COND; - bits<6> COUNT; - bits<1> VALID_PIXEL_MODE; - bits<1> END_OF_PROGRAM; - bits<8> CF_INST; - bits<1> BARRIER; - - let Word1{2-0} = POP_COUNT; - let Word1{7-3} = CF_CONST; - let Word1{9-8} = COND; - let Word1{15-10} = COUNT; - let Word1{20} = VALID_PIXEL_MODE; - let Word1{21} = END_OF_PROGRAM; - let Word1{29-22} = CF_INST; - let Word1{31} = BARRIER; -} - -class CF_ALU_WORD0 { - field bits<32> Word0; - - bits<22> ADDR; - bits<4> KCACHE_BANK0; - bits<4> KCACHE_BANK1; - bits<2> KCACHE_MODE0; - - let Word0{21-0} = ADDR; - let Word0{25-22} = KCACHE_BANK0; - let Word0{29-26} = KCACHE_BANK1; - let Word0{31-30} = KCACHE_MODE0; -} - -class CF_ALU_WORD1 { - field bits<32> Word1; - - bits<2> KCACHE_MODE1; - bits<8> KCACHE_ADDR0; - bits<8> KCACHE_ADDR1; - bits<7> COUNT; - bits<1> ALT_CONST; - bits<4> CF_INST; - bits<1> WHOLE_QUAD_MODE; - bits<1> BARRIER; - - let Word1{1-0} = KCACHE_MODE1; - let Word1{9-2} = KCACHE_ADDR0; - let Word1{17-10} = KCACHE_ADDR1; - let Word1{24-18} = COUNT; - let Word1{25} = ALT_CONST; - let Word1{29-26} = CF_INST; - let Word1{30} = WHOLE_QUAD_MODE; - let Word1{31} = BARRIER; -} - -class CF_ALLOC_EXPORT_WORD0_RAT { - field bits<32> Word0; - - bits<4> rat_id; - bits<6> rat_inst; - bits<2> rim; - bits<2> type; - bits<7> rw_gpr; - bits<1> rw_rel; - bits<7> index_gpr; - bits<2> elem_size; - - let Word0{3-0} = rat_id; - let Word0{9-4} = rat_inst; - let Word0{10} = 0; // Reserved - let Word0{12-11} = rim; - let Word0{14-13} = type; - let Word0{21-15} = rw_gpr; - let Word0{22} = rw_rel; - let Word0{29-23} = index_gpr; - let Word0{31-30} = elem_size; -} - -class CF_ALLOC_EXPORT_WORD1_BUF 
{ - field bits<32> Word1; - - bits<12> array_size; - bits<4> comp_mask; - bits<4> burst_count; - bits<1> vpm; - bits<1> eop; - bits<8> cf_inst; - bits<1> mark; - bits<1> barrier; - - let Word1{11-0} = array_size; - let Word1{15-12} = comp_mask; - let Word1{19-16} = burst_count; - let Word1{20} = vpm; - let Word1{21} = eop; - let Word1{29-22} = cf_inst; - let Word1{30} = mark; - let Word1{31} = barrier; -} diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp deleted file mode 100644 index 5ef883cbcad..00000000000 --- a/lib/Target/R600/R600InstrInfo.cpp +++ /dev/null @@ -1,1435 +0,0 @@ -//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Implementation of TargetInstrInfo. -// -//===----------------------------------------------------------------------===// - -#include "R600InstrInfo.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "AMDGPUTargetMachine.h" -#include "R600Defines.h" -#include "R600MachineFunctionInfo.h" -#include "R600RegisterInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenDFAPacketizer.inc" - -R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} - -const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { - return RI; -} - -bool R600InstrInfo::isTrig(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; -} - -bool R600InstrInfo::isVector(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; -} - -void -R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - unsigned VectorComponents = 0; - if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { - VectorComponents = 4; - } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { - VectorComponents = 2; - } - - if (VectorComponents > 0) { - for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = RI.getSubRegFromChannel(I); - buildDefaultInstruction(MBB, MI, AMDGPU::MOV, - RI.getSubReg(DestReg, SubRegIndex), - RI.getSubReg(SrcReg, SubRegIndex)) - .addReg(DestReg, - RegState::Define | RegState::Implicit); - } - } else { - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, - DestReg, SrcReg); - NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) - .setIsKill(KillSrc); - } -} - -/// \returns true if \p MBBI can be moved into a new basic. 
-bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const { - for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(), - E = MBBI->operands_end(); I != E; ++I) { - if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) && - I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg())) - return false; - } - return true; -} - -bool R600InstrInfo::isMov(unsigned Opcode) const { - - - switch(Opcode) { - default: return false; - case AMDGPU::MOV: - case AMDGPU::MOV_IMM_F32: - case AMDGPU::MOV_IMM_I32: - return true; - } -} - -// Some instructions act as place holders to emulate operations that the GPU -// hardware does automatically. This function can be used to check if -// an opcode falls into this category. -bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { - switch (Opcode) { - default: return false; - case AMDGPU::RETURN: - return true; - } -} - -bool R600InstrInfo::isReductionOp(unsigned Opcode) const { - return false; -} - -bool R600InstrInfo::isCubeOp(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: - return true; - } -} - -bool R600InstrInfo::isALUInstr(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return (TargetFlags & R600_InstFlag::ALU_INST); -} - -bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return ((TargetFlags & R600_InstFlag::OP1) | - (TargetFlags & R600_InstFlag::OP2) | - (TargetFlags & R600_InstFlag::OP3)); -} - -bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { - unsigned TargetFlags = get(Opcode).TSFlags; - - return ((TargetFlags & R600_InstFlag::LDS_1A) | - (TargetFlags & R600_InstFlag::LDS_1A1D) | - (TargetFlags & R600_InstFlag::LDS_1A2D)); -} - -bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; -} - -bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; -} - -bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { - if (isALUInstr(MI->getOpcode())) - return true; - if (isVector(*MI) || isCubeOp(MI->getOpcode())) - return true; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: - return true; - default: - return false; - } -} - -bool R600InstrInfo::isTransOnly(unsigned Opcode) const { - if (ST.hasCaymanISA()) - return false; - return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); -} - -bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { - return isTransOnly(MI->getOpcode()); -} - -bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { - return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); -} - -bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { - return isVectorOnly(MI->getOpcode()); -} - -bool R600InstrInfo::isExport(unsigned Opcode) const { - return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT); -} - -bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { - return ST.hasVertexCache() && IS_VTX(get(Opcode)); -} - -bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const 
R600MachineFunctionInfo *MFI = MF->getInfo(); - return MFI->getShaderType() != ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode()); -} - -bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { - return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); -} - -bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return (MFI->getShaderType() == ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode())) || - usesTextureCache(MI->getOpcode()); -} - -bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { - switch (Opcode) { - case AMDGPU::KILLGT: - case AMDGPU::GROUP_BARRIER: - return true; - default: - return false; - } -} - -bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; -} - -bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; -} - -bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { - if (!isALUInstr(MI->getOpcode())) { - return false; - } - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - if (!I->isReg() || !I->isUse() || - TargetRegisterInfo::isVirtualRegister(I->getReg())) - continue; - - if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) - return true; - } - return false; -} - -int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { - static const unsigned OpTable[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - - assert (SrcNum < 3); - return getOperandIdx(Opcode, OpTable[SrcNum]); -} - -int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { - static const unsigned SrcSelTable[][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} - }; - - for (const auto &Row : SrcSelTable) { - if (getOperandIdx(Opcode, Row[0]) == (int)SrcIdx) { - return getOperandIdx(Opcode, Row[1]); - } - } - return -1; -} - -SmallVector, 3> -R600InstrInfo::getSrcs(MachineInstr *MI) const { - SmallVector, 3> Result; - - if (MI->getOpcode() == AMDGPU::DOT_4) { - static const unsigned OpTable[8][2] = { - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, - }; - - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][0])); - unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), - 
OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); - continue; - } - - } - return Result; - } - - static const unsigned OpTable[3][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - }; - - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); - if (SrcIdx < 0) - break; - MachineOperand &MO = MI->getOperand(SrcIdx); - unsigned Reg = MI->getOperand(SrcIdx).getReg(); - if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand( - getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); - Result.push_back(std::pair(&MO, Sel)); - continue; - } - if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned Imm = MI->getOperand( - getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); - Result.push_back(std::pair(&MO, Imm)); - continue; - } - Result.push_back(std::pair(&MO, 0)); - } - return Result; -} - -std::vector > -R600InstrInfo::ExtractSrcs(MachineInstr *MI, - const DenseMap &PV, - unsigned &ConstCount) const { - ConstCount = 0; - ArrayRef> Srcs = getSrcs(MI); - const std::pair DummyPair(-1, 0); - std::vector > Result; - unsigned i = 0; - for (unsigned n = Srcs.size(); i < n; ++i) { - unsigned Reg = Srcs[i].first->getReg(); - unsigned Index = RI.getEncodingValue(Reg) & 0xff; - if (Reg == AMDGPU::OQAP) { - Result.push_back(std::pair(Index, 0)); - } - if (PV.find(Reg) != PV.end()) { - // 255 is used to tells its a PS/PV reg - Result.push_back(std::pair(255, 0)); - continue; - } - if (Index > 127) { - ConstCount++; - Result.push_back(DummyPair); - continue; - } - unsigned Chan = RI.getHWRegChan(Reg); - Result.push_back(std::pair(Index, Chan)); - } - for (; i < 3; ++i) - Result.push_back(DummyPair); - return Result; -} - -static std::vector > -Swizzle(std::vector > Src, - R600InstrInfo::BankSwizzle Swz) { - if (Src[0] == Src[1]) - Src[1].first = -1; - switch (Swz) { - case R600InstrInfo::ALU_VEC_012_SCL_210: - break; - case R600InstrInfo::ALU_VEC_021_SCL_122: - std::swap(Src[1], Src[2]); - break; - case R600InstrInfo::ALU_VEC_102_SCL_221: - std::swap(Src[0], Src[1]); - break; - case R600InstrInfo::ALU_VEC_120_SCL_212: - std::swap(Src[0], Src[1]); - std::swap(Src[0], Src[2]); - break; - case R600InstrInfo::ALU_VEC_201: - std::swap(Src[0], Src[2]); - std::swap(Src[0], Src[1]); - break; - case R600InstrInfo::ALU_VEC_210: - std::swap(Src[0], Src[2]); - break; - } - return Src; -} - -static unsigned -getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { - switch (Swz) { - case R600InstrInfo::ALU_VEC_012_SCL_210: { - unsigned Cycles[3] = { 2, 1, 0}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_021_SCL_122: { - unsigned Cycles[3] = { 1, 2, 2}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_120_SCL_212: { - unsigned Cycles[3] = { 2, 1, 2}; - return Cycles[Op]; - } - case R600InstrInfo::ALU_VEC_102_SCL_221: { - unsigned Cycles[3] = { 2, 2, 1}; - return Cycles[Op]; - } - default: - llvm_unreachable("Wrong Swizzle for Trans Slot"); - return 0; - } -} - -/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed -/// in the same Instruction Group while meeting read port limitations given a -/// Swz swizzle sequence. 
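/// Roughly: each of the four register banks (the X/Y/Z/W channels) can feed
/// only one register per read cycle, so two operands that fall on the same
/// bank and cycle are only compatible if they name the same register.
/// PS/PV forwards (encoded as index 255) bypass the banks, and the OQAP
/// output queue can only be fetched during the first cycle.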
-unsigned R600InstrInfo::isLegalUpTo( - const std::vector > > &IGSrcs, - const std::vector &Swz, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const { - int Vector[4][3]; - memset(Vector, -1, sizeof(Vector)); - for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) { - const std::vector > &Srcs = - Swizzle(IGSrcs[i], Swz[i]); - for (unsigned j = 0; j < 3; j++) { - const std::pair &Src = Srcs[j]; - if (Src.first < 0 || Src.first == 255) - continue; - if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { - if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && - Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { - // The value from output queue A (denoted by register OQAP) can - // only be fetched during the first cycle. - return false; - } - // OQAP does not count towards the normal read port restrictions - continue; - } - if (Vector[Src.second][j] < 0) - Vector[Src.second][j] = Src.first; - if (Vector[Src.second][j] != Src.first) - return i; - } - } - // Now check Trans Alu - for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) { - const std::pair &Src = TransSrcs[i]; - unsigned Cycle = getTransSwizzle(TransSwz, i); - if (Src.first < 0) - continue; - if (Src.first == 255) - continue; - if (Vector[Src.second][Cycle] < 0) - Vector[Src.second][Cycle] = Src.first; - if (Vector[Src.second][Cycle] != Src.first) - return IGSrcs.size() - 1; - } - return IGSrcs.size(); -} - -/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next -/// (in lexicographic term) swizzle sequence assuming that all swizzles after -/// Idx can be skipped -static bool -NextPossibleSolution( - std::vector &SwzCandidate, - unsigned Idx) { - assert(Idx < SwzCandidate.size()); - int ResetIdx = Idx; - while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210) - ResetIdx --; - for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) { - SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210; - } - if (ResetIdx == -1) - return false; - int NextSwizzle = SwzCandidate[ResetIdx] + 1; - SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle; - return true; -} - -/// Enumerate all possible Swizzle sequence to find one that can meet all -/// read port requirements. -bool R600InstrInfo::FindSwizzleForVectorSlot( - const std::vector > > &IGSrcs, - std::vector &SwzCandidate, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const { - unsigned ValidUpTo = 0; - do { - ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz); - if (ValidUpTo == IGSrcs.size()) - return true; - } while (NextPossibleSolution(SwzCandidate, ValidUpTo)); - return false; -} - -/// Instructions in Trans slot can't read gpr at cycle 0 if they also read -/// a const, and can't read a gpr at cycle 1 if they read 2 const. 
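/// Equivalently: with one constant the swizzle must keep GPR reads out of
/// cycle 0, with two constants out of cycles 0 and 1, and three constants
/// can never be satisfied, which is why the helper below rejects
/// ConstCount > 2.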
-static bool -isConstCompatible(R600InstrInfo::BankSwizzle TransSwz, - const std::vector > &TransOps, - unsigned ConstCount) { - // TransALU can't read 3 constants - if (ConstCount > 2) - return false; - for (unsigned i = 0, e = TransOps.size(); i < e; ++i) { - const std::pair &Src = TransOps[i]; - unsigned Cycle = getTransSwizzle(TransSwz, i); - if (Src.first < 0) - continue; - if (ConstCount > 0 && Cycle == 0) - return false; - if (ConstCount > 1 && Cycle == 1) - return false; - } - return true; -} - -bool -R600InstrInfo::fitsReadPortLimitations(const std::vector &IG, - const DenseMap &PV, - std::vector &ValidSwizzle, - bool isLastAluTrans) - const { - //Todo : support shared src0 - src1 operand - - std::vector > > IGSrcs; - ValidSwizzle.clear(); - unsigned ConstCount; - BankSwizzle TransBS = ALU_VEC_012_SCL_210; - for (unsigned i = 0, e = IG.size(); i < e; ++i) { - IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); - unsigned Op = getOperandIdx(IG[i]->getOpcode(), - AMDGPU::OpName::bank_swizzle); - ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) - IG[i]->getOperand(Op).getImm()); - } - std::vector > TransOps; - if (!isLastAluTrans) - return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS); - - TransOps = std::move(IGSrcs.back()); - IGSrcs.pop_back(); - ValidSwizzle.pop_back(); - - static const R600InstrInfo::BankSwizzle TransSwz[] = { - ALU_VEC_012_SCL_210, - ALU_VEC_021_SCL_122, - ALU_VEC_120_SCL_212, - ALU_VEC_102_SCL_221 - }; - for (unsigned i = 0; i < 4; i++) { - TransBS = TransSwz[i]; - if (!isConstCompatible(TransBS, TransOps, ConstCount)) - continue; - bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, - TransBS); - if (Result) { - ValidSwizzle.push_back(TransBS); - return true; - } - } - - return false; -} - - -bool -R600InstrInfo::fitsConstReadLimitations(const std::vector &Consts) - const { - assert (Consts.size() <= 12 && "Too many operands in instructions group"); - unsigned Pair1 = 0, Pair2 = 0; - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - unsigned ReadConstHalf = Consts[i] & 2; - unsigned ReadConstIndex = Consts[i] & (~3); - unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; - if (!Pair1) { - Pair1 = ReadHalfConst; - continue; - } - if (Pair1 == ReadHalfConst) - continue; - if (!Pair2) { - Pair2 = ReadHalfConst; - continue; - } - if (Pair2 != ReadHalfConst) - return false; - } - return true; -} - -bool -R600InstrInfo::fitsConstReadLimitations(const std::vector &MIs) - const { - std::vector Consts; - SmallSet Literals; - for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr *MI = MIs[i]; - if (!isALUInstr(MI->getOpcode())) - continue; - - ArrayRef> Srcs = getSrcs(MI); - - for (unsigned j = 0, e = Srcs.size(); j < e; j++) { - std::pair Src = Srcs[j]; - if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) - Literals.insert(Src.second); - if (Literals.size() > 4) - return false; - if (Src.first->getReg() == AMDGPU::ALU_CONST) - Consts.push_back(Src.second); - if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || - AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { - unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; - unsigned Chan = RI.getHWRegChan(Src.first->getReg()); - Consts.push_back((Index << 2) | Chan); - } - } - } - return fitsConstReadLimitations(Consts); -} - -DFAPacketizer * -R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { - const InstrItineraryData *II = STI.getInstrItineraryData(); - return 
static_cast(STI).createDFAPacketizer(II); -} - -static bool -isPredicateSetter(unsigned Opcode) { - switch (Opcode) { - case AMDGPU::PRED_X: - return true; - default: - return false; - } -} - -static MachineInstr * -findFirstPredicateSetterFrom(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - while (I != MBB.begin()) { - --I; - MachineInstr *MI = I; - if (isPredicateSetter(MI->getOpcode())) - return MI; - } - - return nullptr; -} - -static -bool isJump(unsigned Opcode) { - return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; -} - -static bool isBranch(unsigned Opcode) { - return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || - Opcode == AMDGPU::BRANCH_COND_f32; -} - -bool -R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { - // Most of the following comes from the ARM implementation of AnalyzeBranch - - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - // AMDGPU::BRANCH* instructions are only available after isel and are not - // handled - if (isBranch(I->getOpcode())) - return true; - if (!isJump(static_cast(I)->getOpcode())) { - return false; - } - - // Remove successive JUMP - while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { - MachineBasicBlock::iterator PriorI = std::prev(I); - if (AllowModify) - I->removeFromParent(); - I = PriorI; - } - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || - !isJump(static_cast(--I)->getOpcode())) { - if (LastOpc == AMDGPU::JUMP) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else if (LastOpc == AMDGPU::JUMP_COND) { - MachineInstr *predSet = I; - while (!isPredicateSetter(predSet->getOpcode())) { - predSet = --I; - } - TBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(predSet->getOperand(1)); - Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If the block ends with a B and a Bcc, handle it. - if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { - MachineInstr *predSet = --I; - while (!isPredicateSetter(predSet->getOpcode())) { - predSet = --I; - } - TBB = SecondLastInst->getOperand(0).getMBB(); - FBB = LastInst->getOperand(0).getMBB(); - Cond.push_back(predSet->getOperand(1)); - Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); - return false; - } - - // Otherwise, can't handle this. 
- return true; -} - -static -MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { - for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); - It != E; ++It) { - if (It->getOpcode() == AMDGPU::CF_ALU || - It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) - return std::prev(It.base()); - } - return MBB.end(); -} - -unsigned -R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - ArrayRef Cond, - DebugLoc DL) const { - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - - if (!FBB) { - if (Cond.empty()) { - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); - return 1; - } else { - MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); - assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); - PredSet->getOperand(2).setImm(Cond[1].getImm()); - - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) - .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - return 1; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); - return 1; - } - } else { - MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); - assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); - PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) - .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - return 2; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); - return 2; - } -} - -unsigned -R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - - // Note : we leave PRED* instructions there. - // They may be needed when predicating instructions. - - MachineBasicBlock::iterator I = MBB.end(); - - if (I == MBB.begin()) { - return 0; - } - --I; - switch (I->getOpcode()) { - default: - return 0; - case AMDGPU::JUMP_COND: { - MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); - I->eraseFromParent(); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); - break; - } - case AMDGPU::JUMP: - I->eraseFromParent(); - break; - } - I = MBB.end(); - - if (I == MBB.begin()) { - return 1; - } - --I; - switch (I->getOpcode()) { - // FIXME: only one case?? 
- default: - return 1; - case AMDGPU::JUMP_COND: { - MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); - I->eraseFromParent(); - MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); - if (CfAlu == MBB.end()) - break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); - break; - } - case AMDGPU::JUMP: - I->eraseFromParent(); - break; - } - return 2; -} - -bool -R600InstrInfo::isPredicated(const MachineInstr *MI) const { - int idx = MI->findFirstPredOperandIdx(); - if (idx < 0) - return false; - - unsigned Reg = MI->getOperand(idx).getReg(); - switch (Reg) { - default: return false; - case AMDGPU::PRED_SEL_ONE: - case AMDGPU::PRED_SEL_ZERO: - case AMDGPU::PREDICATE_BIT: - return true; - } -} - -bool -R600InstrInfo::isPredicable(MachineInstr *MI) const { - // XXX: KILL* instructions can be predicated, but they must be the last - // instruction in a clause, so this means any instructions after them cannot - // be predicated. Until we have proper support for instruction clauses in the - // backend, we will mark KILL* instructions as unpredicable. - - if (MI->getOpcode() == AMDGPU::KILLGT) { - return false; - } else if (MI->getOpcode() == AMDGPU::CF_ALU) { - // If the clause start in the middle of MBB then the MBB has more - // than a single clause, unable to predicate several clauses. - if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) - return false; - // TODO: We don't support KC merging atm - if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) - return false; - return true; - } else if (isVector(*MI)) { - return false; - } else { - return AMDGPUInstrInfo::isPredicable(MI); - } -} - - -bool -R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ - return true; -} - -bool -R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, - unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, - unsigned ExtraFCycles, - const BranchProbability &Probability) const { - return true; -} - -bool -R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCyles, - const BranchProbability &Probability) - const { - return true; -} - -bool -R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const { - return false; -} - - -bool -R600InstrInfo::ReverseBranchCondition(SmallVectorImpl &Cond) const { - MachineOperand &MO = Cond[1]; - switch (MO.getImm()) { - case OPCODE_IS_ZERO_INT: - MO.setImm(OPCODE_IS_NOT_ZERO_INT); - break; - case OPCODE_IS_NOT_ZERO_INT: - MO.setImm(OPCODE_IS_ZERO_INT); - break; - case OPCODE_IS_ZERO: - MO.setImm(OPCODE_IS_NOT_ZERO); - break; - case OPCODE_IS_NOT_ZERO: - MO.setImm(OPCODE_IS_ZERO); - break; - default: - return true; - } - - MachineOperand &MO2 = Cond[2]; - switch (MO2.getReg()) { - case AMDGPU::PRED_SEL_ZERO: - MO2.setReg(AMDGPU::PRED_SEL_ONE); - break; - case AMDGPU::PRED_SEL_ONE: - MO2.setReg(AMDGPU::PRED_SEL_ZERO); - break; - default: - return true; - } - return false; -} - -bool -R600InstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const { - return isPredicateSetter(MI->getOpcode()); -} - - -bool -R600InstrInfo::SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const { - return false; -} - - -bool -R600InstrInfo::PredicateInstruction(MachineInstr *MI, - ArrayRef Pred) const { - int PIdx = 
MI->findFirstPredOperandIdx(); - - if (MI->getOpcode() == AMDGPU::CF_ALU) { - MI->getOperand(8).setImm(0); - return true; - } - - if (MI->getOpcode() == AMDGPU::DOT_4) { - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) - .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) - .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) - .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) - .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); - return true; - } - - if (PIdx != -1) { - MachineOperand &PMO = MI->getOperand(PIdx); - PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); - return true; - } - - return false; -} - -unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { - return 2; -} - -unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost) const { - if (PredCost) - *PredCost = 2; - return 2; -} - -bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::R600_EXTRACT_ELT_V2: - case AMDGPU::R600_EXTRACT_ELT_V4: - buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(2).getReg(), - RI.getHWRegChan(MI->getOperand(1).getReg())); - break; - case AMDGPU::R600_INSERT_ELT_V2: - case AMDGPU::R600_INSERT_ELT_V4: - buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(3).getReg(), // Offset - RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel - break; - } - MI->eraseFromParent(); - return true; -} - -void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - const AMDGPUFrameLowering *TFL = static_cast( - MF.getSubtarget().getFrameLowering()); - - unsigned StackWidth = TFL->getStackWidth(MF); - int End = getIndirectIndexEnd(MF); - - if (End == -1) - return; - - for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { - unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); - Reserved.set(SuperReg); - for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); - Reserved.set(Reg); - } - } -} - -unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - // XXX: Remove when we support a stack width > 2 - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::R600_TReg32_XRegClass; -} - -MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const { - return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); -} - -MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const { - unsigned AddrReg; - switch (AddrChan) { - default: llvm_unreachable("Invalid 
Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; - } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); - - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, - AddrReg, ValueReg) - .addReg(AMDGPU::AR_X, - RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); - return Mov; -} - -MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const { - return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); -} - -MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const { - unsigned AddrReg; - switch (AddrChan) { - default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; - } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, - OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, - ValueReg, - AddrReg) - .addReg(AMDGPU::AR_X, - RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); - - return Mov; -} - -unsigned R600InstrInfo::getMaxAlusPerClause() const { - return 115; -} - -MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opcode, - unsigned DstReg, - unsigned Src0Reg, - unsigned Src1Reg) const { - MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode), - DstReg); // $dst - - if (Src1Reg) { - MIB.addImm(0) // $update_exec_mask - .addImm(0); // $update_predicate - } - MIB.addImm(1) // $write - .addImm(0) // $omod - .addImm(0) // $dst_rel - .addImm(0) // $dst_clamp - .addReg(Src0Reg) // $src0 - .addImm(0) // $src0_neg - .addImm(0) // $src0_rel - .addImm(0) // $src0_abs - .addImm(-1); // $src0_sel - - if (Src1Reg) { - MIB.addReg(Src1Reg) // $src1 - .addImm(0) // $src1_neg - .addImm(0) // $src1_rel - .addImm(0) // $src1_abs - .addImm(-1); // $src1_sel - } - - //XXX: The r600g finalizer expects this to be 1, once we've moved the - //scheduling to the backend, we can change the default to 0. 
- MIB.addImm(1) // $last - .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel - .addImm(0) // $literal - .addImm(0); // $bank_swizzle - - return MIB; -} - -#define OPERAND_CASE(Label) \ - case Label: { \ - static const unsigned Ops[] = \ - { \ - Label##_X, \ - Label##_Y, \ - Label##_Z, \ - Label##_W \ - }; \ - return Ops[Slot]; \ - } - -static unsigned getSlotedOps(unsigned Op, unsigned Slot) { - switch (Op) { - OPERAND_CASE(AMDGPU::OpName::update_exec_mask) - OPERAND_CASE(AMDGPU::OpName::update_pred) - OPERAND_CASE(AMDGPU::OpName::write) - OPERAND_CASE(AMDGPU::OpName::omod) - OPERAND_CASE(AMDGPU::OpName::dst_rel) - OPERAND_CASE(AMDGPU::OpName::clamp) - OPERAND_CASE(AMDGPU::OpName::src0) - OPERAND_CASE(AMDGPU::OpName::src0_neg) - OPERAND_CASE(AMDGPU::OpName::src0_rel) - OPERAND_CASE(AMDGPU::OpName::src0_abs) - OPERAND_CASE(AMDGPU::OpName::src0_sel) - OPERAND_CASE(AMDGPU::OpName::src1) - OPERAND_CASE(AMDGPU::OpName::src1_neg) - OPERAND_CASE(AMDGPU::OpName::src1_rel) - OPERAND_CASE(AMDGPU::OpName::src1_abs) - OPERAND_CASE(AMDGPU::OpName::src1_sel) - OPERAND_CASE(AMDGPU::OpName::pred_sel) - default: - llvm_unreachable("Wrong Operand"); - } -} - -#undef OPERAND_CASE - -MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( - MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) - const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); - unsigned Opcode; - if (ST.getGeneration() <= AMDGPUSubtarget::R700) - Opcode = AMDGPU::DOT4_r600; - else - Opcode = AMDGPU::DOT4_eg; - MachineBasicBlock::iterator I = MI; - MachineOperand &Src0 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); - MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); - MachineInstr *MIB = buildDefaultInstruction( - MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); - static const unsigned Operands[14] = { - AMDGPU::OpName::update_exec_mask, - AMDGPU::OpName::update_pred, - AMDGPU::OpName::write, - AMDGPU::OpName::omod, - AMDGPU::OpName::dst_rel, - AMDGPU::OpName::clamp, - AMDGPU::OpName::src0_neg, - AMDGPU::OpName::src0_rel, - AMDGPU::OpName::src0_abs, - AMDGPU::OpName::src0_sel, - AMDGPU::OpName::src1_neg, - AMDGPU::OpName::src1_rel, - AMDGPU::OpName::src1_abs, - AMDGPU::OpName::src1_sel, - }; - - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) - .setReg(MO.getReg()); - - for (unsigned i = 0; i < 14; i++) { - MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); - assert (MO.isImm()); - setImmOperand(MIB, Operands[i], MO.getImm()); - } - MIB->getOperand(20).setImm(0); - return MIB; -} - -MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const { - MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, - AMDGPU::ALU_LITERAL_X); - setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); - return MovImm; -} - -MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const { - return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); -} - -int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { - return getOperandIdx(MI.getOpcode(), Op); -} - -int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned 
Op) const { - return AMDGPU::getNamedOperandIdx(Opcode, Op); -} - -void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, - int64_t Imm) const { - int Idx = getOperandIdx(*MI, Op); - assert(Idx != -1 && "Operand not supported for this instruction."); - assert(MI->getOperand(Idx).isImm()); - MI->getOperand(Idx).setImm(Imm); -} - -//===----------------------------------------------------------------------===// -// Instruction flag getters/setters -//===----------------------------------------------------------------------===// - -bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { - return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; -} - -MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, - unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; - int FlagIndex = 0; - if (Flag != 0) { - // If we pass something other than the default value of Flag to this - // function, it means we are want to set a flag on an instruction - // that uses native encoding. - assert(HAS_NATIVE_OPERANDS(TargetFlags)); - bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; - switch (Flag) { - case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); - break; - case MO_FLAG_MASK: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); - break; - case MO_FLAG_NOT_LAST: - case MO_FLAG_LAST: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); - break; - case MO_FLAG_NEG: - switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; - case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; - } - break; - - case MO_FLAG_ABS: - assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " - "instructions."); - (void)IsOP3; - switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; - } - break; - - default: - FlagIndex = -1; - break; - } - assert(FlagIndex != -1 && "Flag not supported for this instruction"); - } else { - FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); - assert(FlagIndex != 0 && - "Instruction flags not supported for this instruction"); - } - - MachineOperand &FlagOp = MI->getOperand(FlagIndex); - assert(FlagOp.isImm()); - return FlagOp; -} - -void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, - unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; - if (Flag == 0) { - return; - } - if (HAS_NATIVE_OPERANDS(TargetFlags)) { - MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); - if (Flag == MO_FLAG_NOT_LAST) { - clearFlag(MI, Operand, MO_FLAG_LAST); - } else if (Flag == MO_FLAG_MASK) { - clearFlag(MI, Operand, Flag); - } else { - FlagOp.setImm(1); - } - } else { - MachineOperand &FlagOp = getFlagOp(MI, Operand); - FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); - } -} - -void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, - unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; - if (HAS_NATIVE_OPERANDS(TargetFlags)) { - MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); - FlagOp.setImm(0); - } else { - MachineOperand &FlagOp = getFlagOp(MI); - unsigned InstFlags = FlagOp.getImm(); - InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); - FlagOp.setImm(InstFlags); - } -} diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h deleted 
file mode 100644 index dee4c2b9ae3..00000000000 --- a/lib/Target/R600/R600InstrInfo.h +++ /dev/null @@ -1,303 +0,0 @@ -//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for R600InstrInfo -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H -#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H - -#include "AMDGPUInstrInfo.h" -#include "R600Defines.h" -#include "R600RegisterInfo.h" -#include - -namespace llvm { - - class AMDGPUTargetMachine; - class DFAPacketizer; - class ScheduleDAG; - class MachineFunction; - class MachineInstr; - class MachineInstrBuilder; - - class R600InstrInfo : public AMDGPUInstrInfo { - private: - const R600RegisterInfo RI; - - std::vector > - ExtractSrcs(MachineInstr *MI, const DenseMap &PV, unsigned &ConstCount) const; - - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - public: - enum BankSwizzle { - ALU_VEC_012_SCL_210 = 0, - ALU_VEC_021_SCL_122, - ALU_VEC_120_SCL_212, - ALU_VEC_102_SCL_221, - ALU_VEC_201, - ALU_VEC_210 - }; - - explicit R600InstrInfo(const AMDGPUSubtarget &st); - - const R600RegisterInfo &getRegisterInfo() const override; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) const override; - - bool isTrig(const MachineInstr &MI) const; - bool isPlaceHolderOpcode(unsigned opcode) const; - bool isReductionOp(unsigned opcode) const; - bool isCubeOp(unsigned opcode) const; - - /// \returns true if this \p Opcode represents an ALU instruction. - bool isALUInstr(unsigned Opcode) const; - bool hasInstrModifiers(unsigned Opcode) const; - bool isLDSInstr(unsigned Opcode) const; - bool isLDSNoRetInstr(unsigned Opcode) const; - bool isLDSRetInstr(unsigned Opcode) const; - - /// \returns true if this \p Opcode represents an ALU instruction or an - /// instruction that will be lowered in ExpandSpecialInstrs Pass. - bool canBeConsideredALU(const MachineInstr *MI) const; - - bool isTransOnly(unsigned Opcode) const; - bool isTransOnly(const MachineInstr *MI) const; - bool isVectorOnly(unsigned Opcode) const; - bool isVectorOnly(const MachineInstr *MI) const; - bool isExport(unsigned Opcode) const; - - bool usesVertexCache(unsigned Opcode) const; - bool usesVertexCache(const MachineInstr *MI) const; - bool usesTextureCache(unsigned Opcode) const; - bool usesTextureCache(const MachineInstr *MI) const; - - bool mustBeLastInClause(unsigned Opcode) const; - bool usesAddressRegister(MachineInstr *MI) const; - bool definesAddressRegister(MachineInstr *MI) const; - bool readsLDSSrcReg(const MachineInstr *MI) const; - - /// \returns The operand index for the given source number. Legal values - /// for SrcNum are 0, 1, and 2. 
- int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; - /// \returns The operand Index for the Sel operand given an index to one - /// of the instruction's src operands. - int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; - - /// \returns a pair for each src of an ALU instructions. - /// The first member of a pair is the register id. - /// If register is ALU_CONST, second member is SEL. - /// If register is ALU_LITERAL, second member is IMM. - /// Otherwise, second member value is undefined. - SmallVector, 3> - getSrcs(MachineInstr *MI) const; - - unsigned isLegalUpTo( - const std::vector > > &IGSrcs, - const std::vector &Swz, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const; - - bool FindSwizzleForVectorSlot( - const std::vector > > &IGSrcs, - std::vector &SwzCandidate, - const std::vector > &TransSrcs, - R600InstrInfo::BankSwizzle TransSwz) const; - - /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210 - /// returns true and the first (in lexical order) BankSwizzle affectation - /// starting from the one already provided in the Instruction Group MIs that - /// fits Read Port limitations in BS if available. Otherwise returns false - /// and undefined content in BS. - /// isLastAluTrans should be set if the last Alu of MIs will be executed on - /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to - /// apply to the last instruction. - /// PV holds GPR to PV registers in the Instruction Group MIs. - bool fitsReadPortLimitations(const std::vector &MIs, - const DenseMap &PV, - std::vector &BS, - bool isLastAluTrans) const; - - /// An instruction group can only access 2 channel pair (either [XY] or [ZW]) - /// from KCache bank on R700+. This function check if MI set in input meet - /// this limitations - bool fitsConstReadLimitations(const std::vector &) const; - /// Same but using const index set instead of MI set. - bool fitsConstReadLimitations(const std::vector&) const; - - /// \brief Vector instructions are instructions that must fill all - /// instruction slots within an instruction group. 
- bool isVector(const MachineInstr &MI) const; - - bool isMov(unsigned Opcode) const override; - - DFAPacketizer * - CreateTargetScheduleState(const TargetSubtargetInfo &) const override; - - bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; - - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, bool AllowModify) const override; - - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, ArrayRef Cond, - DebugLoc DL) const override; - - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - - bool isPredicated(const MachineInstr *MI) const override; - - bool isPredicable(MachineInstr *MI) const override; - - bool - isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; - - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; - - bool - isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; - - bool DefinesPredicate(MachineInstr *MI, - std::vector &Pred) const override; - - bool SubsumesPredicate(ArrayRef Pred1, - ArrayRef Pred2) const override; - - bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const override; - - bool PredicateInstruction(MachineInstr *MI, - ArrayRef Pred) const override; - - unsigned int getPredicationCost(const MachineInstr *) const override; - - unsigned int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, - unsigned *PredCost = nullptr) const override; - - int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const override { return 1;} - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - /// \brief Reserve the registers that may be accesed using indirect addressing. - void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; - - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; - - const TargetRegisterClass *getIndirectAddrRegClass() const override; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; - - unsigned getMaxAlusPerClause() const; - - ///buildDefaultInstruction - This function returns a MachineInstr with - /// all the instruction modifiers initialized to their default values. - /// You can use this function to avoid manually specifying each instruction - /// modifier operand when building a new instruction. - /// - /// \returns a MachineInstr with all the instruction modifiers initialized - /// to their default values. 
- MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - unsigned Opcode, - unsigned DstReg, - unsigned Src0Reg, - unsigned Src1Reg = 0) const; - - MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, - MachineInstr *MI, - unsigned Slot, - unsigned DstReg) const; - - MachineInstr *buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const; - - MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; - - /// \brief Get the index of Op in the MachineInstr. - /// - /// \returns -1 if the Instruction does not contain the specified \p Op. - int getOperandIdx(const MachineInstr &MI, unsigned Op) const; - - /// \brief Get the index of \p Op for the given Opcode. - /// - /// \returns -1 if the Instruction does not contain the specified \p Op. - int getOperandIdx(unsigned Opcode, unsigned Op) const; - - /// \brief Helper function for setting instruction flag values. - void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; - - /// \returns true if this instruction has an operand for storing target flags. - bool hasFlagOperand(const MachineInstr &MI) const; - - ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. - void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; - - ///\brief Determine if the specified \p Flag is set on this \p Operand. - bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; - - /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) - /// \param Flag The flag being set. - /// - /// \returns the operand containing the flags for this instruction. - MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, - unsigned Flag = 0) const; - - /// \brief Clear the specified flag on the instruction. - void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; -}; - -namespace AMDGPU { - -int getLDSNoRetOp(uint16_t Opcode); - -} //End namespace AMDGPU - -} // End llvm namespace - -#endif diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td deleted file mode 100644 index 7beed092b3f..00000000000 --- a/lib/Target/R600/R600Instructions.td +++ /dev/null @@ -1,1744 +0,0 @@ -//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are available on R600 family -// GPUs. 
-// -//===----------------------------------------------------------------------===// - -include "R600Intrinsics.td" -include "R600InstrFormats.td" - -class InstR600ISA pattern> : - InstR600 { - - let Namespace = "AMDGPU"; -} - -def MEMxi : Operand { - let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); - let PrintMethod = "printMemOperand"; -} - -def MEMrr : Operand { - let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); -} - -// Operands for non-registers - -class InstFlag - : OperandWithDefaultOps { - let PrintMethod = PM; -} - -// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers -def SEL : OperandWithDefaultOps { - let PrintMethod = "printSel"; -} -def BANK_SWIZZLE : OperandWithDefaultOps { - let PrintMethod = "printBankSwizzle"; -} - -def LITERAL : InstFlag<"printLiteral">; - -def WRITE : InstFlag <"printWrite", 1>; -def OMOD : InstFlag <"printOMOD">; -def REL : InstFlag <"printRel">; -def CLAMP : InstFlag <"printClamp">; -def NEG : InstFlag <"printNeg">; -def ABS : InstFlag <"printAbs">; -def UEM : InstFlag <"printUpdateExecMask">; -def UP : InstFlag <"printUpdatePred">; - -// XXX: The r600g finalizer in Mesa expects last to be one in most cases. -// Once we start using the packetizer in this backend we should have this -// default to 0. -def LAST : InstFlag<"printLast", 1>; -def RSel : Operand { - let PrintMethod = "printRSel"; -} -def CT: Operand { - let PrintMethod = "printCT"; -} - -def FRAMEri : Operand { - let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); -} - -def ADDRParam : ComplexPattern; -def ADDRDWord : ComplexPattern; -def ADDRVTX_READ : ComplexPattern; -def ADDRGA_CONST_OFFSET : ComplexPattern; -def ADDRGA_VAR_OFFSET : ComplexPattern; - - -def R600_Pred : PredicateOperand; - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - -// Class for instructions with only one source register. -// If you add new ins to this instruction, make sure they are listed before -// $literal, because the backend currently assumes that the last operand is -// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in -// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), -// and R600InstrInfo::getOperandIdx(). -class R600_1OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, - "$clamp $last $dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$pred_sel $bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP2 { - - let src1 = 0; - let src1_rel = 0; - let src1_neg = 0; - let src1_abs = 0; - let update_exec_mask = 0; - let update_pred = 0; - let HasNativeOperands = 1; - let Op1 = 1; - let ALUInst = 1; - let DisableEncoding = "$literal"; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_1OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itin = AnyALU> : - R600_1OP ; - -// If you add or change the operands for R600_2OP instructions, you must -// also update the R600Op2OperandIndex::ROI enum in R600Defines.h, -// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). 
-class R600_2OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, - OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, - "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " - "$pred_sel $bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP2 { - - let HasNativeOperands = 1; - let Op2 = 1; - let ALUInst = 1; - let DisableEncoding = "$literal"; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_2OP_Helper inst, string opName, SDPatternOperator node, - InstrItinClass itin = AnyALU> : - R600_2OP ; - -// If you add our change the operands for R600_3OP instructions, you must -// also update the R600Op3OperandIndex::ROI enum in R600Defines.h, -// R600InstrInfo::buildDefaultInstruction(), and -// R600InstrInfo::getOperandIdx(). -class R600_3OP inst, string opName, list pattern, - InstrItinClass itin = AnyALU> : - InstR600 <(outs R600_Reg32:$dst), - (ins REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, - R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, - BANK_SWIZZLE:$bank_swizzle), - !strconcat(" ", opName, "$clamp $last $dst$dst_rel, " - "$src0_neg$src0$src0_rel, " - "$src1_neg$src1$src1_rel, " - "$src2_neg$src2$src2_rel, " - "$pred_sel" - "$bank_swizzle"), - pattern, - itin>, - R600ALU_Word0, - R600ALU_Word1_OP3{ - - let HasNativeOperands = 1; - let DisableEncoding = "$literal"; - let Op3 = 1; - let UseNamedOperandTable = 1; - let ALUInst = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class R600_REDUCTION inst, dag ins, string asm, list pattern, - InstrItinClass itin = VecALU> : - InstR600 <(outs R600_Reg32:$dst), - ins, - asm, - pattern, - itin>; - - - -} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 - -def TEX_SHADOW : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || TType == 13; - }] ->; - -def TEX_RECT : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 5; - }] ->; - -def TEX_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 9 || TType == 10 || TType == 16; - }] ->; - -def TEX_SHADOW_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 11 || TType == 12 || TType == 17; - }] ->; - -def TEX_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 14; - }] ->; - -def TEX_ARRAY_MSAA : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 15; - }] ->; - -class EG_CF_RAT cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, - dag outs, dag ins, string asm, list pattern> : - InstR600ISA , - CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF { - - let rat_id = ratid; - let rat_inst = ratinst; - let rim = 0; - // XXX: Have a separate instruction for non-indexed writes. 
- let type = 1; - let rw_rel = 0; - let elem_size = 0; - - let array_size = 0; - let comp_mask = mask; - let burst_count = 0; - let vpm = 0; - let cf_inst = cfinst; - let mark = 0; - let barrier = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; - -} - -class VTX_READ buffer_id, dag outs, list pattern> - : InstR600ISA , - VTX_WORD1_GPR { - - // Static fields - let DST_REL = 0; - // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, - // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, - // however, based on my testing if USE_CONST_FIELDS is set, then all - // these fields need to be set to 0. - let USE_CONST_FIELDS = 0; - let NUM_FORMAT_ALL = 1; - let FORMAT_COMP_ALL = 0; - let SRF_MODE_ALL = 0; - - let Inst{63-32} = Word1; - // LLVM can only encode 64-bit instructions, so these fields are manually - // encoded in R600CodeEmitter - // - // bits<16> OFFSET; - // bits<2> ENDIAN_SWAP = 0; - // bits<1> CONST_BUF_NO_STRIDE = 0; - // bits<1> MEGA_FETCH = 0; - // bits<1> ALT_CONST = 0; - // bits<2> BUFFER_INDEX_MODE = 0; - - // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding - // is done in R600CodeEmitter - // - // Inst{79-64} = OFFSET; - // Inst{81-80} = ENDIAN_SWAP; - // Inst{82} = CONST_BUF_NO_STRIDE; - // Inst{83} = MEGA_FETCH; - // Inst{84} = ALT_CONST; - // Inst{86-85} = BUFFER_INDEX_MODE; - // Inst{95-86} = 0; Reserved - - // VTX_WORD3 (Padding) - // - // Inst{127-96} = 0; - - let VTXInst = 1; -} - -class LoadParamFrag : PatFrag < - (ops node:$ptr), (load_type node:$ptr), - [{ return isConstantLoad(dyn_cast(N), 0); }] ->; - -def load_param : LoadParamFrag; -def load_param_exti8 : LoadParamFrag; -def load_param_exti16 : LoadParamFrag; - -def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; - -def isR600toCayman - : Predicate< - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; - -//===----------------------------------------------------------------------===// -// R600 SDNodes -//===----------------------------------------------------------------------===// - -def INTERP_PAIR_XY : AMDGPUShaderInst < - (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), - (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), - "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", - []>; - -def INTERP_PAIR_ZW : AMDGPUShaderInst < - (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), - (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), - "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", - []>; - -def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", - SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, - [SDNPVariadic] ->; - -def DOT4 : SDNode<"AMDGPUISD::DOT4", - SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, - SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, - SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, - [] ->; - -def COS_HW : SDNode<"AMDGPUISD::COS_HW", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> ->; - -def SIN_HW : SDNode<"AMDGPUISD::SIN_HW", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> ->; - -def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; - -def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; - -multiclass TexPattern TextureOp, Instruction inst, ValueType vt = v4f32> { -def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, - (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), - (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), - (i32 
imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), - (i32 imm:$DST_SEL_W), - (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID), - (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z), - (i32 imm:$COORD_TYPE_W)), - (inst R600_Reg128:$SRC_GPR, - imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw, - imm:$offsetx, imm:$offsety, imm:$offsetz, - imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z, - imm:$DST_SEL_W, - imm:$RESOURCE_ID, imm:$SAMPLER_ID, - imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z, - imm:$COORD_TYPE_W)>; -} - -//===----------------------------------------------------------------------===// -// Interpolation Instructions -//===----------------------------------------------------------------------===// - -def INTERP_VEC_LOAD : AMDGPUShaderInst < - (outs R600_Reg128:$dst), - (ins i32imm:$src0), - "INTERP_LOAD $src0 : $dst", - [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; - -def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { - let bank_swizzle = 5; -} - -def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> { - let bank_swizzle = 5; -} - -def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; - -//===----------------------------------------------------------------------===// -// Export Instructions -//===----------------------------------------------------------------------===// - -def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; - -def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, - [SDNPHasChain, SDNPSideEffect]>; - -class ExportWord0 { - field bits<32> Word0; - - bits<13> arraybase; - bits<2> type; - bits<7> gpr; - bits<2> elem_size; - - let Word0{12-0} = arraybase; - let Word0{14-13} = type; - let Word0{21-15} = gpr; - let Word0{22} = 0; // RW_REL - let Word0{29-23} = 0; // INDEX_GPR - let Word0{31-30} = elem_size; -} - -class ExportSwzWord1 { - field bits<32> Word1; - - bits<3> sw_x; - bits<3> sw_y; - bits<3> sw_z; - bits<3> sw_w; - bits<1> eop; - bits<8> inst; - - let Word1{2-0} = sw_x; - let Word1{5-3} = sw_y; - let Word1{8-6} = sw_z; - let Word1{11-9} = sw_w; -} - -class ExportBufWord1 { - field bits<32> Word1; - - bits<12> arraySize; - bits<4> compMask; - bits<1> eop; - bits<8> inst; - - let Word1{11-0} = arraySize; - let Word1{15-12} = compMask; -} - -multiclass ExportPattern cf_inst> { - def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 0, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 7, 0, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy (i32 imm:$type)), - (ExportInst - (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy 1), - (ExportInst - (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), - (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), - (ExportInst R600_Reg128:$src, imm:$type, imm:$base, - imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) - >; - -} - -multiclass SteamOutputExportPattern buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { -// Stream0 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), - (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, 0, imm:$arraybase, - 4095, imm:$mask, buf0inst, 0)>; -// Stream1 - def : Pat<(int_R600_store_stream_output (v4f32 
R600_Reg128:$src), - (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), - (ExportInst $src, 0, imm:$arraybase, - 4095, imm:$mask, buf1inst, 0)>; -// Stream2 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), - (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), - (ExportInst $src, 0, imm:$arraybase, - 4095, imm:$mask, buf2inst, 0)>; -// Stream3 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), - (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), - (ExportInst $src, 0, imm:$arraybase, - 4095, imm:$mask, buf3inst, 0)>; -} - -// Export Instructions should not be duplicated by TailDuplication pass -// (which assumes that duplicable instruction are affected by exec mask) -let usesCustomInserter = 1, isNotDuplicable = 1 in { - -class ExportSwzInst : InstR600ISA<( - outs), - (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, - RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst, - i32imm:$eop), - !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"), - []>, ExportWord0, ExportSwzWord1 { - let elem_size = 3; - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; -} - -} // End usesCustomInserter = 1 - -class ExportBufInst : InstR600ISA<( - outs), - (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, - i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), - !strconcat("EXPORT", " $gpr"), - []>, ExportWord0, ExportBufWord1 { - let elem_size = 0; - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - let IsExport = 1; -} - -//===----------------------------------------------------------------------===// -// Control Flow Instructions -//===----------------------------------------------------------------------===// - - -def KCACHE : InstFlag<"printKCache">; - -class ALU_CLAUSE inst, string OpName> : AMDGPUInst <(outs), -(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, -KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, -i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, -i32imm:$COUNT, i32imm:$Enabled), -!strconcat(OpName, " $COUNT, @$ADDR, " -"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), -[] >, CF_ALU_WORD0, CF_ALU_WORD1 { - field bits<64> Inst; - - let CF_INST = inst; - let ALT_CONST = 0; - let WHOLE_QUAD_MODE = 0; - let BARRIER = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class CF_WORD0_R600 { - field bits<32> Word0; - - bits<32> ADDR; - - let Word0 = ADDR; -} - -class CF_CLAUSE_R600 inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), -ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { - field bits<64> Inst; - bits<4> CNT; - - let CF_INST = inst; - let BARRIER = 1; - let CF_CONST = 0; - let VALID_PIXEL_MODE = 0; - let COND = 0; - let COUNT = CNT{2-0}; - let CALL_COUNT = 0; - let COUNT_3 = CNT{3}; - let END_OF_PROGRAM = 0; - let WHOLE_QUAD_MODE = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -class CF_CLAUSE_EG inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), -ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { - field bits<64> Inst; - - let CF_INST = inst; - let BARRIER = 1; - let JUMPTABLE_SEL = 0; - let CF_CONST = 0; - let VALID_PIXEL_MODE = 0; - let COND = 0; - let END_OF_PROGRAM = 0; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; -} - -def CF_ALU : ALU_CLAUSE<8, "ALU">; -def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; -def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; -def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; -def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; -def CF_ALU_ELSE_AFTER : 
ALU_CLAUSE<15, "ALU_ELSE_AFTER">; - -def FETCH_CLAUSE : AMDGPUInst <(outs), -(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { - field bits<8> Inst; - bits<8> num; - let Inst = num; - let isCodeGenOnly = 1; -} - -def ALU_CLAUSE : AMDGPUInst <(outs), -(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { - field bits<8> Inst; - bits<8> num; - let Inst = num; - let isCodeGenOnly = 1; -} - -def LITERALS : AMDGPUInst <(outs), -(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { - let isCodeGenOnly = 1; - - field bits<64> Inst; - bits<32> literal1; - bits<32> literal2; - - let Inst{31-0} = literal1; - let Inst{63-32} = literal2; -} - -def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { - field bits<64> Inst; -} - -let Predicates = [isR600toCayman] in { - -//===----------------------------------------------------------------------===// -// Common Instructions R600, R700, Evergreen, Cayman -//===----------------------------------------------------------------------===// - -def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; -// Non-IEEE MUL: 0 * anything = 0 -def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; -def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; -// TODO: Do these actually match the regular fmin/fmax behavior? -def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; -def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; -// According to https://msdn.microsoft.com/en-us/library/windows/desktop/cc308050%28v=vs.85%29.aspx -// DX10 min/max returns the other operand if one is NaN, -// this matches http://llvm.org/docs/LangRef.html#llvm-minnum-intrinsic -def MAX_DX10 : R600_2OP_Helper <0x5, "MAX_DX10", fmaxnum>; -def MIN_DX10 : R600_2OP_Helper <0x6, "MIN_DX10", fminnum>; - -// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, -// so some of the instruction names don't match the asm string. -// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
-def SETE : R600_2OP < - 0x08, "SETE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))] ->; - -def SGT : R600_2OP < - 0x09, "SETGT", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))] ->; - -def SGE : R600_2OP < - 0xA, "SETGE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))] ->; - -def SNE : R600_2OP < - 0xB, "SETNE", - [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE_NE))] ->; - -def SETE_DX10 : R600_2OP < - 0xC, "SETE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))] ->; - -def SETGT_DX10 : R600_2OP < - 0xD, "SETGT_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))] ->; - -def SETGE_DX10 : R600_2OP < - 0xE, "SETGE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))] ->; - -// FIXME: This should probably be COND_ONE -def SETNE_DX10 : R600_2OP < - 0xF, "SETNE_DX10", - [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] ->; - -def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; -def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; -def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; -def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>; -def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>; - -def MOV : R600_1OP <0x19, "MOV", []>; - -let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { - -class MOV_IMM : AMDGPUInst < - (outs R600_Reg32:$dst), - (ins immType:$imm), - "", - [] ->; - -} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 - -def MOV_IMM_I32 : MOV_IMM; -def : Pat < - (imm:$val), - (MOV_IMM_I32 imm:$val) ->; - -def MOV_IMM_F32 : MOV_IMM; -def : Pat < - (fpimm:$val), - (MOV_IMM_F32 fpimm:$val) ->; - -def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>; -def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>; -def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>; -def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>; - -let hasSideEffects = 1 in { - -def KILLGT : R600_2OP <0x2D, "KILLGT", []>; - -} // end hasSideEffects - -def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>; -def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>; -def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>; -def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>; -def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>; -def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>; -def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", smax>; -def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", smin>; -def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", umax>; -def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", umin>; - -def SETE_INT : R600_2OP < - 0x3A, "SETE_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] ->; - -def SETGT_INT : R600_2OP < - 0x3B, "SETGT_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] ->; - -def SETGE_INT : R600_2OP < - 0x3C, "SETGE_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] ->; - -def SETNE_INT : R600_2OP < - 0x3D, "SETNE_INT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] ->; - -def SETGT_UINT : R600_2OP < - 0x3E, "SETGT_UINT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] ->; - -def SETGE_UINT : R600_2OP < - 0x3F, "SETGE_UINT", - [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] ->; - -def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; -def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>; -def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>; 
-def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; - -def CNDE_INT : R600_3OP < - 0x1C, "CNDE_INT", - [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] ->; - -def CNDGE_INT : R600_3OP < - 0x1E, "CNDGE_INT", - [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))] ->; - -def CNDGT_INT : R600_3OP < - 0x1D, "CNDGT_INT", - [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))] ->; - -//===----------------------------------------------------------------------===// -// Texture instructions -//===----------------------------------------------------------------------===// - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - -class R600_TEX inst, string opName> : - InstR600 <(outs R600_Reg128:$DST_GPR), - (ins R600_Reg128:$SRC_GPR, - RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw, - i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz, - RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W, - i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, - CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, - CT:$COORD_TYPE_W), - !strconcat(opName, - " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " - "$SRC_GPR.$srcx$srcy$srcz$srcw " - "RID:$RESOURCE_ID SID:$SAMPLER_ID " - "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"), - [], - NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 { - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - - let TEX_INST = inst{4-0}; - let SRC_REL = 0; - let DST_REL = 0; - let LOD_BIAS = 0; - - let INST_MOD = 0; - let FETCH_WHOLE_QUAD = 0; - let ALT_CONST = 0; - let SAMPLER_INDEX_MODE = 0; - let RESOURCE_INDEX_MODE = 0; - - let TEXInst = 1; -} - -} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 - - - -def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">; -def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">; -def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">; -def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">; -def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">; -def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">; -def TEX_LD : R600_TEX <0x03, "TEX_LD">; -def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> { - let INST_MOD = 1; -} -def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">; -def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">; -def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">; -def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">; -def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">; -def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">; -def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">; - -defm : TexPattern<0, TEX_SAMPLE>; -defm : TexPattern<1, TEX_SAMPLE_C>; -defm : TexPattern<2, TEX_SAMPLE_L>; -defm : TexPattern<3, TEX_SAMPLE_C_L>; -defm : TexPattern<4, TEX_SAMPLE_LB>; -defm : TexPattern<5, TEX_SAMPLE_C_LB>; -defm : TexPattern<6, TEX_LD, v4i32>; -defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>; -defm : TexPattern<8, TEX_GET_GRADIENTS_H>; -defm : TexPattern<9, TEX_GET_GRADIENTS_V>; -defm : TexPattern<10, TEX_LDPTR, v4i32>; - -//===----------------------------------------------------------------------===// -// Helper classes for common instructions -//===----------------------------------------------------------------------===// - -class MUL_LIT_Common inst> : R600_3OP < - inst, "MUL_LIT", - [] ->; - -class MULADD_Common inst> : R600_3OP < - inst, "MULADD", - [] ->; - -class MULADD_IEEE_Common inst> : R600_3OP < - inst, "MULADD_IEEE", - [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] ->; 
- -class FMA_Common inst> : R600_3OP < - inst, "FMA", - [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU ->; - -class CNDE_Common inst> : R600_3OP < - inst, "CNDE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))] ->; - -class CNDGT_Common inst> : R600_3OP < - inst, "CNDGT", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))] -> { - let Itinerary = VecALU; -} - -class CNDGE_Common inst> : R600_3OP < - inst, "CNDGE", - [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))] -> { - let Itinerary = VecALU; -} - - -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { -class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins -// Slot X - UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, - OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, - R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, - R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, - R600_Pred:$pred_sel_X, -// Slot Y - UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, - OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, - R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, - R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, - R600_Pred:$pred_sel_Y, -// Slot Z - UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, - OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, - R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, - R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, - R600_Pred:$pred_sel_Z, -// Slot W - UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, - OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, - R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, - R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, - R600_Pred:$pred_sel_W, - LITERAL:$literal0, LITERAL:$literal1), - "", - pattern, - AnyALU> { - - let UseNamedOperandTable = 1; - -} -} - -def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 - R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, - R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, - R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, - R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; - - -class DOT4_Common inst> : R600_2OP ; - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { -multiclass CUBE_Common inst> { - - def _pseudo : InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0), - "CUBE $dst $src0", - [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], - VecALU - > { - let isPseudo = 1; - let UseNamedOperandTable = 1; - } - - def _real : R600_2OP ; -} -} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 - -class EXP_IEEE_Common inst> : R600_1OP_Helper < - inst, "EXP_IEEE", fexp2 -> { - let Itinerary = TransALU; -} - -class FLT_TO_INT_Common inst> : R600_1OP_Helper < - inst, "FLT_TO_INT", fp_to_sint -> { - let Itinerary = TransALU; -} - -class INT_TO_FLT_Common inst> : R600_1OP_Helper < - inst, "INT_TO_FLT", sint_to_fp -> { - let Itinerary = TransALU; -} - -class FLT_TO_UINT_Common inst> : R600_1OP_Helper < - inst, "FLT_TO_UINT", fp_to_uint -> { - let Itinerary = TransALU; -} - -class UINT_TO_FLT_Common inst> : R600_1OP_Helper < - inst, "UINT_TO_FLT", uint_to_fp -> { - let Itinerary = TransALU; -} - -class LOG_CLAMPED_Common inst> : R600_1OP < - inst, 
"LOG_CLAMPED", [] ->; - -class LOG_IEEE_Common inst> : R600_1OP_Helper < - inst, "LOG_IEEE", flog2 -> { - let Itinerary = TransALU; -} - -class LSHL_Common inst> : R600_2OP_Helper ; -class LSHR_Common inst> : R600_2OP_Helper ; -class ASHR_Common inst> : R600_2OP_Helper ; -class MULHI_INT_Common inst> : R600_2OP_Helper < - inst, "MULHI_INT", mulhs -> { - let Itinerary = TransALU; -} -class MULHI_UINT_Common inst> : R600_2OP_Helper < - inst, "MULHI", mulhu -> { - let Itinerary = TransALU; -} -class MULLO_INT_Common inst> : R600_2OP_Helper < - inst, "MULLO_INT", mul -> { - let Itinerary = TransALU; -} -class MULLO_UINT_Common inst> : R600_2OP { - let Itinerary = TransALU; -} - -class RECIP_CLAMPED_Common inst> : R600_1OP < - inst, "RECIP_CLAMPED", [] -> { - let Itinerary = TransALU; -} - -class RECIP_IEEE_Common inst> : R600_1OP < - inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))] -> { - let Itinerary = TransALU; -} - -class RECIP_UINT_Common inst> : R600_1OP_Helper < - inst, "RECIP_UINT", AMDGPUurecip -> { - let Itinerary = TransALU; -} - -// Clamped to maximum. -class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped -> { - let Itinerary = TransALU; -} - -class RECIPSQRT_IEEE_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy -> { - let Itinerary = TransALU; -} - -// TODO: There is also RECIPSQRT_FF which clamps to zero. - -class SIN_Common inst> : R600_1OP < - inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{ - let Trig = 1; - let Itinerary = TransALU; -} - -class COS_Common inst> : R600_1OP < - inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> { - let Trig = 1; - let Itinerary = TransALU; -} - -def CLAMP_R600 : CLAMP ; -def FABS_R600 : FABS; -def FNEG_R600 : FNEG; - -//===----------------------------------------------------------------------===// -// Helper patterns for complex intrinsics -//===----------------------------------------------------------------------===// - -// FIXME: Should be predicated on unsafe fp math. 
-multiclass DIV_Common { -def : Pat< - (int_AMDGPU_div f32:$src0, f32:$src1), - (MUL_IEEE $src0, (recip_ieee $src1)) ->; - -def : Pat< - (fdiv f32:$src0, f32:$src1), - (MUL_IEEE $src0, (recip_ieee $src1)) ->; - -def : RcpPat; -} - -class TGSI_LIT_Z_Common - : Pat < - (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), - (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) ->; - -//===----------------------------------------------------------------------===// -// R600 / R700 Instructions -//===----------------------------------------------------------------------===// - -let Predicates = [isR600] in { - - def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; - def MULADD_r600 : MULADD_Common<0x10>; - def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; - def CNDE_r600 : CNDE_Common<0x18>; - def CNDGT_r600 : CNDGT_Common<0x19>; - def CNDGE_r600 : CNDGE_Common<0x1A>; - def DOT4_r600 : DOT4_Common<0x50>; - defm CUBE_r600 : CUBE_Common<0x52>; - def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>; - def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>; - def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>; - def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>; - def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>; - def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>; - def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>; - def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>; - def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>; - def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>; - def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>; - def SIN_r600 : SIN_Common<0x6E>; - def COS_r600 : COS_Common<0x6F>; - def ASHR_r600 : ASHR_Common<0x70>; - def LSHR_r600 : LSHR_Common<0x71>; - def LSHL_r600 : LSHL_Common<0x72>; - def MULLO_INT_r600 : MULLO_INT_Common<0x73>; - def MULHI_INT_r600 : MULHI_INT_Common<0x74>; - def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>; - def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>; - def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; - - defm DIV_r600 : DIV_Common; - def : POW_Common ; - def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common; - - def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; - def : RsqPat; - - def R600_ExportSwz : ExportSwzInst { - let Word1{20-17} = 0; // BURST_COUNT - let Word1{21} = eop; - let Word1{22} = 0; // VALID_PIXEL_MODE - let Word1{30-23} = inst; - let Word1{31} = 1; // BARRIER - } - defm : ExportPattern; - - def R600_ExportBuf : ExportBufInst { - let Word1{20-17} = 0; // BURST_COUNT - let Word1{21} = eop; - let Word1{22} = 0; // VALID_PIXEL_MODE - let Word1{30-23} = inst; - let Word1{31} = 1; // BARRIER - } - defm : SteamOutputExportPattern; - - def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT), - "TEX $CNT @$ADDR"> { - let POP_COUNT = 0; - } - def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT), - "VTX $CNT @$ADDR"> { - let POP_COUNT = 0; - } - def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), - "LOOP_START_DX10 @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), - "LOOP_BREAK @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), - "CONTINUE @$ADDR"> { - let POP_COUNT = 0; - let CNT = 0; - } - def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "JUMP @$ADDR POP:$POP_COUNT"> { - let CNT = 0; - } - def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), - 
"PUSH_ELSE @$ADDR"> { - let CNT = 0; - let POP_COUNT = 0; // FIXME? - } - def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "ELSE @$ADDR POP:$POP_COUNT"> { - let CNT = 0; - } - def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { - let ADDR = 0; - let CNT = 0; - let POP_COUNT = 0; - } - def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), - "POP @$ADDR POP:$POP_COUNT"> { - let CNT = 0; - } - def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { - let CNT = 0; - let POP_COUNT = 0; - let ADDR = 0; - let END_OF_PROGRAM = 1; - } - -} - - -//===----------------------------------------------------------------------===// -// Regist loads and stores - for indirect addressing -//===----------------------------------------------------------------------===// - -defm R600_ : RegisterLoadStore ; - - -//===----------------------------------------------------------------------===// -// Pseudo instructions -//===----------------------------------------------------------------------===// - -let isPseudo = 1 in { - -def PRED_X : InstR600 < - (outs R600_Predicate_Bit:$dst), - (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), - "", [], NullALU> { - let FlagOperandIdx = 3; -} - -let isTerminator = 1, isBranch = 1 in { -def JUMP_COND : InstR600 < - (outs), - (ins brtarget:$target, R600_Predicate_Bit:$p), - "JUMP $target ($p)", - [], AnyALU - >; - -def JUMP : InstR600 < - (outs), - (ins brtarget:$target), - "JUMP $target", - [], AnyALU - > -{ - let isPredicable = 1; - let isBarrier = 1; -} - -} // End isTerminator = 1, isBranch = 1 - -let usesCustomInserter = 1 in { - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { - -def MASK_WRITE : AMDGPUShaderInst < - (outs), - (ins R600_Reg32:$src), - "MASK_WRITE $src", - [] ->; - -} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 - - -def TXD: InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, - i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, imm:$textureTarget))], - NullALU > { - let TEXInst = 1; -} - -def TXD_SHADOW: InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, - i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], - NullALU -> { - let TEXInst = 1; -} -} // End isPseudo = 1 -} // End usesCustomInserter = 1 - - -//===----------------------------------------------------------------------===// -// Constant Buffer Addressing Support -//===----------------------------------------------------------------------===// - -let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { -def CONST_COPY : Instruction { - let OutOperandList = (outs R600_Reg32:$dst); - let InOperandList = (ins i32imm:$src); - let Pattern = - [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; - let AsmString = "CONST_COPY"; - let hasSideEffects = 0; - let isAsCheapAsAMove = 1; - let Itinerary = NullALU; -} -} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" - -def TEX_VTX_CONSTBUF : - InstR600ISA <(outs R600_Reg128:$dst), 
(ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", - [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, - VTX_WORD1_GPR, VTX_WORD0_eg { - - let VC_INST = 0; - let FETCH_TYPE = 2; - let FETCH_WHOLE_QUAD = 0; - let SRC_REL = 0; - let SRC_SEL_X = 0; - let DST_REL = 0; - let USE_CONST_FIELDS = 0; - let NUM_FORMAT_ALL = 2; - let FORMAT_COMP_ALL = 1; - let SRF_MODE_ALL = 1; - let MEGA_FETCH_COUNT = 16; - let DST_SEL_X = 0; - let DST_SEL_Y = 1; - let DST_SEL_Z = 2; - let DST_SEL_W = 3; - let DATA_FORMAT = 35; - - let Inst{31-0} = Word0; - let Inst{63-32} = Word1; - -// LLVM can only encode 64-bit instructions, so these fields are manually -// encoded in R600CodeEmitter -// -// bits<16> OFFSET; -// bits<2> ENDIAN_SWAP = 0; -// bits<1> CONST_BUF_NO_STRIDE = 0; -// bits<1> MEGA_FETCH = 0; -// bits<1> ALT_CONST = 0; -// bits<2> BUFFER_INDEX_MODE = 0; - - - -// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding -// is done in R600CodeEmitter -// -// Inst{79-64} = OFFSET; -// Inst{81-80} = ENDIAN_SWAP; -// Inst{82} = CONST_BUF_NO_STRIDE; -// Inst{83} = MEGA_FETCH; -// Inst{84} = ALT_CONST; -// Inst{86-85} = BUFFER_INDEX_MODE; -// Inst{95-86} = 0; Reserved - -// VTX_WORD3 (Padding) -// -// Inst{127-96} = 0; - let VTXInst = 1; -} - -def TEX_VTX_TEXBUF: - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", - [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, -VTX_WORD1_GPR, VTX_WORD0_eg { - -let VC_INST = 0; -let FETCH_TYPE = 2; -let FETCH_WHOLE_QUAD = 0; -let SRC_REL = 0; -let SRC_SEL_X = 0; -let DST_REL = 0; -let USE_CONST_FIELDS = 1; -let NUM_FORMAT_ALL = 0; -let FORMAT_COMP_ALL = 0; -let SRF_MODE_ALL = 1; -let MEGA_FETCH_COUNT = 16; -let DST_SEL_X = 0; -let DST_SEL_Y = 1; -let DST_SEL_Z = 2; -let DST_SEL_W = 3; -let DATA_FORMAT = 0; - -let Inst{31-0} = Word0; -let Inst{63-32} = Word1; - -// LLVM can only encode 64-bit instructions, so these fields are manually -// encoded in R600CodeEmitter -// -// bits<16> OFFSET; -// bits<2> ENDIAN_SWAP = 0; -// bits<1> CONST_BUF_NO_STRIDE = 0; -// bits<1> MEGA_FETCH = 0; -// bits<1> ALT_CONST = 0; -// bits<2> BUFFER_INDEX_MODE = 0; - - - -// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding -// is done in R600CodeEmitter -// -// Inst{79-64} = OFFSET; -// Inst{81-80} = ENDIAN_SWAP; -// Inst{82} = CONST_BUF_NO_STRIDE; -// Inst{83} = MEGA_FETCH; -// Inst{84} = ALT_CONST; -// Inst{86-85} = BUFFER_INDEX_MODE; -// Inst{95-86} = 0; Reserved - -// VTX_WORD3 (Padding) -// -// Inst{127-96} = 0; - let VTXInst = 1; -} - -//===---------------------------------------------------------------------===// -// Flow and Program control Instructions -//===---------------------------------------------------------------------===// -class ILFormat pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 1; -} - -multiclass BranchConditional { - def _i32 : ILFormat<(outs), - (ins brtarget:$target, rci:$src0), - "; i32 Pseudo branch instruction", - [(Op bb:$target, (i32 rci:$src0))]>; - def _f32 : ILFormat<(outs), - (ins brtarget:$target, rcf:$src0), - "; f32 Pseudo branch instruction", - [(Op bb:$target, (f32 
rcf:$src0))]>; -} - -// Only scalar types should generate flow control -multiclass BranchInstr { - def _i32 : ILFormat<(outs), (ins R600_Reg32:$src), - !strconcat(name, " $src"), []>; - def _f32 : ILFormat<(outs), (ins R600_Reg32:$src), - !strconcat(name, " $src"), []>; -} -// Only scalar types should generate flow control -multiclass BranchInstr2 { - def _i32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), - !strconcat(name, " $src0, $src1"), []>; - def _f32 : ILFormat<(outs), (ins R600_Reg32:$src0, R600_Reg32:$src1), - !strconcat(name, " $src0, $src1"), []>; -} - -//===---------------------------------------------------------------------===// -// Custom Inserter for Branches and returns, this eventually will be a -// separate pass -//===---------------------------------------------------------------------===// -let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { - def BRANCH : ILFormat<(outs), (ins brtarget:$target), - "; Pseudo unconditional branch instruction", - [(br bb:$target)]>; - defm BRANCH_COND : BranchConditional; -} - -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; -} - -//===----------------------------------------------------------------------===// -// Branch Instructions -//===----------------------------------------------------------------------===// - -def IF_PREDICATE_SET : ILFormat<(outs), (ins R600_Reg32:$src), - "IF_PREDICATE_SET $src", []>; - -let isTerminator=1 in { - def BREAK : ILFormat< (outs), (ins), - "BREAK", []>; - def CONTINUE : ILFormat< (outs), (ins), - "CONTINUE", []>; - def DEFAULT : ILFormat< (outs), (ins), - "DEFAULT", []>; - def ELSE : ILFormat< (outs), (ins), - "ELSE", []>; - def ENDSWITCH : ILFormat< (outs), (ins), - "ENDSWITCH", []>; - def ENDMAIN : ILFormat< (outs), (ins), - "ENDMAIN", []>; - def END : ILFormat< (outs), (ins), - "END", []>; - def ENDFUNC : ILFormat< (outs), (ins), - "ENDFUNC", []>; - def ENDIF : ILFormat< (outs), (ins), - "ENDIF", []>; - def WHILELOOP : ILFormat< (outs), (ins), - "WHILE", []>; - def ENDLOOP : ILFormat< (outs), (ins), - "ENDLOOP", []>; - def FUNC : ILFormat< (outs), (ins), - "FUNC", []>; - def RETDYN : ILFormat< (outs), (ins), - "RET_DYN", []>; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; - defm IFC : BranchInstr2<"IFC">; - defm BREAKC : BranchInstr2<"BREAKC">; - defm CONTINUEC : BranchInstr2<"CONTINUEC">; -} - -//===----------------------------------------------------------------------===// -// Indirect addressing pseudo instructions 
-//===----------------------------------------------------------------------===// - -let isPseudo = 1 in { - -class ExtractVertical : InstR600 < - (outs R600_Reg32:$dst), - (ins vec_rc:$vec, R600_Reg32:$index), "", - [], - AnyALU ->; - -let Constraints = "$dst = $vec" in { - -class InsertVertical : InstR600 < - (outs vec_rc:$dst), - (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", - [], - AnyALU ->; - -} // End Constraints = "$dst = $vec" - -} // End isPseudo = 1 - -def R600_EXTRACT_ELT_V2 : ExtractVertical ; -def R600_EXTRACT_ELT_V4 : ExtractVertical ; - -def R600_INSERT_ELT_V2 : InsertVertical ; -def R600_INSERT_ELT_V4 : InsertVertical ; - -class ExtractVerticalPat : Pat < - (scalar_ty (extractelt vec_ty:$vec, i32:$index)), - (inst $vec, $index) ->; - -def : ExtractVerticalPat ; -def : ExtractVerticalPat ; -def : ExtractVerticalPat ; -def : ExtractVerticalPat ; - -class InsertVerticalPat : Pat < - (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), - (inst $vec, $value, $index) ->; - -def : InsertVerticalPat ; -def : InsertVerticalPat ; -def : InsertVerticalPat ; -def : InsertVerticalPat ; - -//===----------------------------------------------------------------------===// -// ISel Patterns -//===----------------------------------------------------------------------===// - -// CND*_INT Pattterns for f32 True / False values - -class CND_INT_f32 : Pat < - (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), - (cnd $src0, $src1, $src2) ->; - -def : CND_INT_f32 ; -def : CND_INT_f32 ; -def : CND_INT_f32 ; - -//CNDGE_INT extra pattern -def : Pat < - (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), - (CNDGE_INT $src0, $src1, $src2) ->; - -// KIL Patterns -def KILP : Pat < - (int_AMDGPU_kilp), - (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) ->; - -def KIL : Pat < - (int_AMDGPU_kill f32:$src0), - (MASK_WRITE (KILLGT (f32 ZERO), $src0)) ->; - -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; - -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; -def : Insert_Element ; - -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; - -def : Extract_Element ; -def : Extract_Element ; - -def : Insert_Element ; -def : Insert_Element ; - -// bitconvert patterns - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -// DWORDADDR pattern -def : DwordAddrPat ; - -} // End isR600toCayman Predicate - -let Predicates = [isR600] in { -// Intrinsic patterns -defm : Expand24IBitOps; -defm : Expand24UBitOps; -} // End isR600 - -def getLDSNoRetOp : InstrMapping { - let FilterClass = "R600_LDS_1A1D"; - let RowFields = ["BaseOp"]; - let ColFields = ["DisableEncoding"]; - let KeyCol = ["$dst"]; - let ValueCols = [[""""]]; -} diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td deleted file mode 100644 index 9681747006d..00000000000 --- a/lib/Target/R600/R600Intrinsics.td +++ /dev/null @@ -1,75 +0,0 @@ -//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// R600 Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "R600", isTarget = 1 in { - class TextureIntrinsicFloatInput : - Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - class TextureIntrinsicInt32Input : - Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - - def int_R600_load_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_const : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_R600_interp_xy : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; -def int_R600_interp_zw : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_R600_load_texbuf : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_tex : TextureIntrinsicFloatInput; - def int_R600_texc : TextureIntrinsicFloatInput; - def int_R600_txl : TextureIntrinsicFloatInput; - def int_R600_txlc : TextureIntrinsicFloatInput; - def int_R600_txb : TextureIntrinsicFloatInput; - def int_R600_txbc : TextureIntrinsicFloatInput; - def int_R600_txf : TextureIntrinsicInt32Input; - def int_R600_ldptr : TextureIntrinsicInt32Input; - def int_R600_txq : TextureIntrinsicInt32Input; - def int_R600_ddx : TextureIntrinsicFloatInput; - def int_R600_ddy : TextureIntrinsicFloatInput; - def int_R600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_stream_output : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_pixel_depth : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_pixel_stencil : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_dummy : - Intrinsic<[], [llvm_i32_ty], []>; -} diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp deleted file mode 100644 index 01105c614c5..00000000000 --- a/lib/Target/R600/R600MachineFunctionInfo.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "R600MachineFunctionInfo.h" - -using namespace llvm; - - -// Pin the vtable to this file. 
-void R600MachineFunctionInfo::anchor() {} - -R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) - : AMDGPUMachineFunction(MF) { } diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h deleted file mode 100644 index 263561edd30..00000000000 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ /dev/null @@ -1,34 +0,0 @@ -//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H - -#include "AMDGPUMachineFunction.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include - -namespace llvm { - -class R600MachineFunctionInfo : public AMDGPUMachineFunction { - void anchor() override; -public: - R600MachineFunctionInfo(const MachineFunction &MF); - SmallVector LiveOuts; - std::vector IndirectRegs; - unsigned StackSize; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp deleted file mode 100644 index bcde5fb50da..00000000000 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ /dev/null @@ -1,469 +0,0 @@ -//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Machine Scheduler interface -// -//===----------------------------------------------------------------------===// - -#include "R600MachineScheduler.h" -#include "AMDGPUSubtarget.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Pass.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "misched" - -void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { - assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); - DAG = static_cast(dag); - const AMDGPUSubtarget &ST = DAG->MF.getSubtarget(); - TII = static_cast(DAG->TII); - TRI = static_cast(DAG->TRI); - VLIW5 = !ST.hasCaymanISA(); - MRI = &DAG->MRI; - CurInstKind = IDOther; - CurEmitted = 0; - OccupedSlotsMask = 31; - InstKindLimit[IDAlu] = TII->getMaxAlusPerClause(); - InstKindLimit[IDOther] = 32; - InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); - AluInstCount = 0; - FetchInstCount = 0; -} - -void R600SchedStrategy::MoveUnits(std::vector &QSrc, - std::vector &QDst) -{ - QDst.insert(QDst.end(), QSrc.begin(), QSrc.end()); - QSrc.clear(); -} - -static -unsigned getWFCountLimitedByGPR(unsigned GPRCount) { - assert (GPRCount && "GPRCount cannot be 0"); - return 248 / GPRCount; -} - -SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { - SUnit *SU = nullptr; - NextInstKind = IDOther; - - IsTopNode = false; - - // check if we might want to switch current clause type - bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) || - (Available[CurInstKind].empty()); - bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && - (!Available[IDFetch].empty() || !Available[IDOther].empty()); - - if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { - // We use the heuristic provided by AMD Accelerated Parallel Processing - // OpenCL Programming Guide : - // The approx. number of WF that allows TEX inst to hide ALU inst is : - // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) - float ALUFetchRationEstimate = - (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / - (FetchInstCount + Available[IDFetch].size()); - if (ALUFetchRationEstimate == 0) { - AllowSwitchFromAlu = true; - } else { - unsigned NeededWF = 62.5f / ALUFetchRationEstimate; - DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); - // We assume the local GPR requirements to be "dominated" by the requirement - // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and - // after TEX are indeed likely to consume or generate values from/for the - // TEX clause. - // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause - // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need - // one GPR) or TmXYZW = TnXYZW (need 2 GPR). - // (TODO : use RegisterPressure) - // If we are going too use too many GPR, we flush Fetch instruction to lower - // register pressure on 128 bits regs. 
- unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); - if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) - AllowSwitchFromAlu = true; - } - } - - if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) || - (!AllowSwitchFromAlu && CurInstKind == IDAlu))) { - // try to pick ALU - SU = pickAlu(); - if (!SU && !PhysicalRegCopy.empty()) { - SU = PhysicalRegCopy.front(); - PhysicalRegCopy.erase(PhysicalRegCopy.begin()); - } - if (SU) { - if (CurEmitted >= InstKindLimit[IDAlu]) - CurEmitted = 0; - NextInstKind = IDAlu; - } - } - - if (!SU) { - // try to pick FETCH - SU = pickOther(IDFetch); - if (SU) - NextInstKind = IDFetch; - } - - // try to pick other - if (!SU) { - SU = pickOther(IDOther); - if (SU) - NextInstKind = IDOther; - } - - DEBUG( - if (SU) { - dbgs() << " ** Pick node **\n"; - SU->dump(DAG); - } else { - dbgs() << "NO NODE \n"; - for (unsigned i = 0; i < DAG->SUnits.size(); i++) { - const SUnit &S = DAG->SUnits[i]; - if (!S.isScheduled) - S.dump(DAG); - } - } - ); - - return SU; -} - -void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { - if (NextInstKind != CurInstKind) { - DEBUG(dbgs() << "Instruction Type Switch\n"); - if (NextInstKind != IDAlu) - OccupedSlotsMask |= 31; - CurEmitted = 0; - CurInstKind = NextInstKind; - } - - if (CurInstKind == IDAlu) { - AluInstCount ++; - switch (getAluKind(SU)) { - case AluT_XYZW: - CurEmitted += 4; - break; - case AluDiscarded: - break; - default: { - ++CurEmitted; - for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), - E = SU->getInstr()->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) - ++CurEmitted; - } - } - } - } else { - ++CurEmitted; - } - - - DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); - - if (CurInstKind != IDFetch) { - MoveUnits(Pending[IDFetch], Available[IDFetch]); - } else - FetchInstCount++; -} - -static bool -isPhysicalRegCopy(MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::COPY) - return false; - - return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); -} - -void R600SchedStrategy::releaseTopNode(SUnit *SU) { - DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); -} - -void R600SchedStrategy::releaseBottomNode(SUnit *SU) { - DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); - if (isPhysicalRegCopy(SU->getInstr())) { - PhysicalRegCopy.push_back(SU); - return; - } - - int IK = getInstKind(SU); - - // There is no export clause, we can schedule one as soon as its ready - if (IK == IDOther) - Available[IDOther].push_back(SU); - else - Pending[IK].push_back(SU); - -} - -bool R600SchedStrategy::regBelongsToClass(unsigned Reg, - const TargetRegisterClass *RC) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - return RC->contains(Reg); - } else { - return MRI->getRegClass(Reg) == RC; - } -} - -R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { - MachineInstr *MI = SU->getInstr(); - - if (TII->isTransOnly(MI)) - return AluTrans; - - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return AluT_XYZW; - case AMDGPU::COPY: - if (MI->getOperand(1).isUndef()) { - // MI will become a KILL, don't considers it in scheduling - return AluDiscarded; - } - default: - break; - } - - // Does the instruction take a whole IG ? 
- // XXX: Is it possible to add a helper function in R600InstrInfo that can - // be used here and in R600PacketizerList::isSoloInstruction() ? - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { - return AluT_XYZW; - } - - if (TII->isLDSInstr(MI->getOpcode())) { - return AluT_X; - } - - // Is the result already assigned to a channel ? - unsigned DestSubReg = MI->getOperand(0).getSubReg(); - switch (DestSubReg) { - case AMDGPU::sub0: - return AluT_X; - case AMDGPU::sub1: - return AluT_Y; - case AMDGPU::sub2: - return AluT_Z; - case AMDGPU::sub3: - return AluT_W; - default: - break; - } - - // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) - return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) - return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) - return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) - return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) - return AluT_XYZW; - - // LDS src registers cannot be used in the Trans slot. - if (TII->readsLDSSrcReg(MI)) - return AluT_XYZW; - - return AluAny; - -} - -int R600SchedStrategy::getInstKind(SUnit* SU) { - int Opcode = SU->getInstr()->getOpcode(); - - if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode)) - return IDFetch; - - if (TII->isALUInstr(Opcode)) { - return IDAlu; - } - - switch (Opcode) { - case AMDGPU::PRED_X: - case AMDGPU::COPY: - case AMDGPU::CONST_COPY: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return IDAlu; - default: - return IDOther; - } -} - -SUnit *R600SchedStrategy::PopInst(std::vector &Q, bool AnyALU) { - if (Q.empty()) - return nullptr; - for (std::vector::reverse_iterator It = Q.rbegin(), E = Q.rend(); - It != E; ++It) { - SUnit *SU = *It; - InstructionsGroupCandidate.push_back(SU->getInstr()); - if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) - && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) - ) { - InstructionsGroupCandidate.pop_back(); - Q.erase((It + 1).base()); - return SU; - } else { - InstructionsGroupCandidate.pop_back(); - } - } - return nullptr; -} - -void R600SchedStrategy::LoadAlu() { - std::vector &QSrc = Pending[IDAlu]; - for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { - AluKind AK = getAluKind(QSrc[i]); - AvailableAlus[AK].push_back(QSrc[i]); - } - QSrc.clear(); -} - -void R600SchedStrategy::PrepareNextSlot() { - DEBUG(dbgs() << "New Slot\n"); - assert (OccupedSlotsMask && "Slot wasn't filled"); - OccupedSlotsMask = 0; -// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) -// OccupedSlotsMask |= 16; - InstructionsGroupCandidate.clear(); - LoadAlu(); -} - -void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { - int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - if (DstIndex == -1) { - return; - } - unsigned DestReg = MI->getOperand(DstIndex).getReg(); - // PressureRegister crashes if an operand is def and used in the same inst - // and we try to constraint its regclass - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { - MachineOperand &MO = *It; - if (MO.isReg() && !MO.isDef() && - MO.getReg() == DestReg) - return; - } - // 
Constrains the regclass of DestReg to assign it to Slot - switch (Slot) { - case 0: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); - break; - case 1: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); - break; - case 2: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); - break; - case 3: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); - break; - } -} - -SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) { - static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W}; - SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu); - if (SlotedSU) - return SlotedSU; - SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu); - if (UnslotedSU) - AssignSlot(UnslotedSU->getInstr(), Slot); - return UnslotedSU; -} - -unsigned R600SchedStrategy::AvailablesAluCount() const { - return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + - AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + - AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + - AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() + - AvailableAlus[AluPredX].size(); -} - -SUnit* R600SchedStrategy::pickAlu() { - while (AvailablesAluCount() || !Pending[IDAlu].empty()) { - if (!OccupedSlotsMask) { - // Bottom up scheduling : predX must comes first - if (!AvailableAlus[AluPredX].empty()) { - OccupedSlotsMask |= 31; - return PopInst(AvailableAlus[AluPredX], false); - } - // Flush physical reg copies (RA will discard them) - if (!AvailableAlus[AluDiscarded].empty()) { - OccupedSlotsMask |= 31; - return PopInst(AvailableAlus[AluDiscarded], false); - } - // If there is a T_XYZW alu available, use it - if (!AvailableAlus[AluT_XYZW].empty()) { - OccupedSlotsMask |= 15; - return PopInst(AvailableAlus[AluT_XYZW], false); - } - } - bool TransSlotOccuped = OccupedSlotsMask & 16; - if (!TransSlotOccuped && VLIW5) { - if (!AvailableAlus[AluTrans].empty()) { - OccupedSlotsMask |= 16; - return PopInst(AvailableAlus[AluTrans], false); - } - SUnit *SU = AttemptFillSlot(3, true); - if (SU) { - OccupedSlotsMask |= 16; - return SU; - } - } - for (int Chan = 3; Chan > -1; --Chan) { - bool isOccupied = OccupedSlotsMask & (1 << Chan); - if (!isOccupied) { - SUnit *SU = AttemptFillSlot(Chan, false); - if (SU) { - OccupedSlotsMask |= (1 << Chan); - InstructionsGroupCandidate.push_back(SU->getInstr()); - return SU; - } - } - } - PrepareNextSlot(); - } - return nullptr; -} - -SUnit* R600SchedStrategy::pickOther(int QID) { - SUnit *SU = nullptr; - std::vector &AQ = Available[QID]; - - if (AQ.empty()) { - MoveUnits(Pending[QID], AQ); - } - if (!AQ.empty()) { - SU = AQ.back(); - AQ.resize(AQ.size() - 1); - } - return SU; -} diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h deleted file mode 100644 index fc5b95c28e7..00000000000 --- a/lib/Target/R600/R600MachineScheduler.h +++ /dev/null @@ -1,103 +0,0 @@ -//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 Machine Scheduler interface -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H -#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H - -#include "R600InstrInfo.h" -#include "llvm/ADT/PriorityQueue.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -namespace llvm { - -class R600SchedStrategy : public MachineSchedStrategy { - - const ScheduleDAGMILive *DAG; - const R600InstrInfo *TII; - const R600RegisterInfo *TRI; - MachineRegisterInfo *MRI; - - enum InstKind { - IDAlu, - IDFetch, - IDOther, - IDLast - }; - - enum AluKind { - AluAny, - AluT_X, - AluT_Y, - AluT_Z, - AluT_W, - AluT_XYZW, - AluPredX, - AluTrans, - AluDiscarded, // LLVM Instructions that are going to be eliminated - AluLast - }; - - std::vector Available[IDLast], Pending[IDLast]; - std::vector AvailableAlus[AluLast]; - std::vector PhysicalRegCopy; - - InstKind CurInstKind; - int CurEmitted; - InstKind NextInstKind; - - unsigned AluInstCount; - unsigned FetchInstCount; - - int InstKindLimit[IDLast]; - - int OccupedSlotsMask; - -public: - R600SchedStrategy() : - DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) { - } - - virtual ~R600SchedStrategy() {} - - void initialize(ScheduleDAGMI *dag) override; - SUnit *pickNode(bool &IsTopNode) override; - void schedNode(SUnit *SU, bool IsTopNode) override; - void releaseTopNode(SUnit *SU) override; - void releaseBottomNode(SUnit *SU) override; - -private: - std::vector InstructionsGroupCandidate; - bool VLIW5; - - int getInstKind(SUnit *SU); - bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const; - AluKind getAluKind(SUnit *SU) const; - void LoadAlu(); - unsigned AvailablesAluCount() const; - SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu); - void PrepareNextSlot(); - SUnit *PopInst(std::vector &Q, bool AnyALU); - - void AssignSlot(MachineInstr *MI, unsigned Slot); - SUnit* pickAlu(); - SUnit* pickOther(int QID); - void MoveUnits(std::vector &QSrc, std::vector &QDst); -}; - -} // namespace llvm - -#endif /* R600MACHINESCHEDULER_H_ */ diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp deleted file mode 100644 index 0c06ccc736d..00000000000 --- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp +++ /dev/null @@ -1,382 +0,0 @@ -//===--------------------- R600MergeVectorRegisters.cpp -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass merges inputs of swizzeable instructions into vector sharing -/// common data and/or have enough undef subreg using swizzle abilities. -/// -/// For instance let's consider the following pseudo code : -/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 -/// ... -/// vreg7 = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 -/// -/// is turned into : -/// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 -/// ... 
-/// vreg7 = INSERT_SUBREG vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 -/// -/// This allow regalloc to reduce register pressure for vector registers and -/// to reduce MOV count. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "vec-merger" - -namespace { - -static bool -isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) { - for (MachineRegisterInfo::def_instr_iterator It = MRI.def_instr_begin(Reg), - E = MRI.def_instr_end(); It != E; ++It) { - return (*It).isImplicitDef(); - } - if (MRI.isReserved(Reg)) { - return false; - } - llvm_unreachable("Reg without a def"); - return false; -} - -class RegSeqInfo { -public: - MachineInstr *Instr; - DenseMap RegToChan; - std::vector UndefReg; - RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { - assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); - for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { - MachineOperand &MO = Instr->getOperand(i); - unsigned Chan = Instr->getOperand(i + 1).getImm(); - if (isImplicitlyDef(MRI, MO.getReg())) - UndefReg.push_back(Chan); - else - RegToChan[MO.getReg()] = Chan; - } - } - RegSeqInfo() {} - - bool operator==(const RegSeqInfo &RSI) const { - return RSI.Instr == Instr; - } -}; - -class R600VectorRegMerger : public MachineFunctionPass { -private: - MachineRegisterInfo *MRI; - const R600InstrInfo *TII; - bool canSwizzle(const MachineInstr &) const; - bool areAllUsesSwizzeable(unsigned Reg) const; - void SwizzleInput(MachineInstr &, - const std::vector > &) const; - bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *, - std::vector > &Remap) const; - bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan); - bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan); - MachineInstr *RebuildVector(RegSeqInfo *MI, - const RegSeqInfo *BaseVec, - const std::vector > &RemapChan) const; - void RemoveMI(MachineInstr *); - void trackRSI(const RegSeqInfo &RSI); - - typedef DenseMap > InstructionSetMap; - DenseMap PreviousRegSeq; - InstructionSetMap PreviousRegSeqByReg; - InstructionSetMap PreviousRegSeqByUndefCount; -public: - static char ID; - R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID), - TII(nullptr) { } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - const char *getPassName() const override { - return "R600 Vector Registers Merge Pass"; - } - - bool runOnMachineFunction(MachineFunction &Fn) override; -}; - -char R600VectorRegMerger::ID = 0; - -bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) - const { - if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) - return true; - switch (MI.getOpcode()) { - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportSwz: - return true; - default: - return false; - } -} - -bool 
R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, - RegSeqInfo *ToMerge, std::vector< std::pair > &Remap) - const { - unsigned CurrentUndexIdx = 0; - for (DenseMap::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { - DenseMap::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); - if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair - ((*It).second, (*PosInUntouched).second)); - continue; - } - if (CurrentUndexIdx >= Untouched->UndefReg.size()) - return false; - Remap.push_back(std::pair - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); - } - - return true; -} - -static -unsigned getReassignedChan( - const std::vector > &RemapChan, - unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return RemapChan[j].second; - } - llvm_unreachable("Chan wasn't reassigned"); -} - -MachineInstr *R600VectorRegMerger::RebuildVector( - RegSeqInfo *RSI, const RegSeqInfo *BaseRSI, - const std::vector > &RemapChan) const { - unsigned Reg = RSI->Instr->getOperand(0).getReg(); - MachineBasicBlock::iterator Pos = RSI->Instr; - MachineBasicBlock &MBB = *Pos->getParent(); - DebugLoc DL = Pos->getDebugLoc(); - - unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg(); - DenseMap UpdatedRegToChan = BaseRSI->RegToChan; - std::vector UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; - unsigned Chan = getReassignedChan(RemapChan, Swizzle); - - MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), - DstReg) - .addReg(SrcVec) - .addReg(SubReg) - .addImm(Chan); - UpdatedRegToChan[SubReg] = Chan; - std::vector::iterator ChanPos = - std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan); - if (ChanPos != UpdatedUndef.end()) - UpdatedUndef.erase(ChanPos); - assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) == - UpdatedUndef.end() && - "UpdatedUndef shouldn't contain Chan more than once!"); - DEBUG(dbgs() << " ->"; Tmp->dump();); - (void)Tmp; - SrcVec = DstReg; - } - Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) - .addReg(SrcVec); - DEBUG(dbgs() << " ->"; Pos->dump();); - - DEBUG(dbgs() << " Updating Swizzle:\n"); - for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), - E = MRI->use_instr_end(); It != E; ++It) { - DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); - SwizzleInput(*It, RemapChan); - DEBUG((*It).dump()); - } - RSI->Instr->eraseFromParent(); - - // Update RSI - RSI->Instr = Pos; - RSI->RegToChan = UpdatedRegToChan; - RSI->UndefReg = UpdatedUndef; - - return Pos; -} - -void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); - } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector &MIs = (*It).second; - MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end()); - } -} - -void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, - const std::vector > &RemapChan) const { - unsigned Offset; - if (TII->get(MI.getOpcode()).TSFlags & 
R600_InstFlag::TEX_INST) - Offset = 2; - else - Offset = 3; - for (unsigned i = 0; i < 4; i++) { - unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); - break; - } - } - } -} - -bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const { - for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), - E = MRI->use_instr_end(); It != E; ++It) { - if (!canSwizzle(*It)) - return false; - } - return true; -} - -bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI, - RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan) { - for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(), - MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) { - if (!MOp->isReg()) - continue; - if (PreviousRegSeqByReg[MOp->getReg()].empty()) - continue; - for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) { - CompatibleRSI = PreviousRegSeq[MI]; - if (RSI == CompatibleRSI) - continue; - if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan)) - return true; - } - } - return false; -} - -bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI, - RegSeqInfo &CompatibleRSI, - std::vector > &RemapChan) { - unsigned NeededUndefs = 4 - RSI.UndefReg.size(); - if (PreviousRegSeqByUndefCount[NeededUndefs].empty()) - return false; - std::vector &MIs = - PreviousRegSeqByUndefCount[NeededUndefs]; - CompatibleRSI = PreviousRegSeq[MIs.back()]; - tryMergeVector(&CompatibleRSI, &RSI, RemapChan); - return true; -} - -void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { - for (DenseMap::const_iterator - It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) { - PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr); - } - PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr); - PreviousRegSeq[RSI.Instr] = RSI; -} - -bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast(Fn.getSubtarget().getInstrInfo()); - MRI = &(Fn.getRegInfo()); - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; - PreviousRegSeq.clear(); - PreviousRegSeqByReg.clear(); - PreviousRegSeqByUndefCount.clear(); - - for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); - MII != MIIE; ++MII) { - MachineInstr *MI = MII; - if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { - if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI->getOperand(1).getReg(); - for (MachineRegisterInfo::def_instr_iterator - It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); - It != E; ++It) { - RemoveMI(&(*It)); - } - } - continue; - } - - - RegSeqInfo RSI(*MRI, MI); - - // All uses of MI are swizzeable ? 
- unsigned Reg = MI->getOperand(0).getReg(); - if (!areAllUsesSwizzeable(Reg)) - continue; - - DEBUG (dbgs() << "Trying to optimize "; - MI->dump(); - ); - - RegSeqInfo CandidateRSI; - std::vector > RemapChan; - DEBUG(dbgs() << "Using common slots...\n";); - if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { - // Remove CandidateRSI mapping - RemoveMI(CandidateRSI.Instr); - MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); - trackRSI(RSI); - continue; - } - DEBUG(dbgs() << "Using free slots...\n";); - RemapChan.clear(); - if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { - RemoveMI(CandidateRSI.Instr); - MII = RebuildVector(&RSI, &CandidateRSI, RemapChan); - trackRSI(RSI); - continue; - } - //Failed to merge - trackRSI(RSI); - } - } - return false; -} - -} - -llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) { - return new R600VectorRegMerger(tm); -} diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp deleted file mode 100644 index deee5bc3997..00000000000 --- a/lib/Target/R600/R600Packetizer.cpp +++ /dev/null @@ -1,408 +0,0 @@ -//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass implements instructions packetization for R600. It unsets isLast -/// bit of instructions inside a bundle and substitutes src register with -/// PreviousVector when applicable. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/Debug.h" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "packets" - -namespace { - -class R600Packetizer : public MachineFunctionPass { - -public: - static char ID; - R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - const char *getPassName() const override { - return "R600 Packetizer"; - } - - bool runOnMachineFunction(MachineFunction &Fn) override; -}; -char R600Packetizer::ID = 0; - -class R600PacketizerList : public VLIWPacketizerList { - -private: - const R600InstrInfo *TII; - const R600RegisterInfo &TRI; - bool VLIW5; - bool ConsideredInstUsesAlreadyWrittenVectorElement; - - unsigned getSlot(const MachineInstr *MI) const { - return TRI.getHWRegChan(MI->getOperand(0).getReg()); - } - - /// \returns register to PV chan mapping for bundle/single instructions that - /// immediately precedes I. 
- DenseMap getPreviousVector(MachineBasicBlock::iterator I) - const { - DenseMap Result; - I--; - if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) - return Result; - MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); - if (I->isBundle()) - BI++; - int LastDstChan = -1; - do { - bool isTrans = false; - int BISlot = getSlot(BI); - if (LastDstChan >= BISlot) - isTrans = true; - LastDstChan = BISlot; - if (TII->isPredicated(BI)) - continue; - int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); - if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) - continue; - int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); - if (DstIdx == -1) { - continue; - } - unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { - Result[Dst] = AMDGPU::PS; - continue; - } - if (BI->getOpcode() == AMDGPU::DOT4_r600 || - BI->getOpcode() == AMDGPU::DOT4_eg) { - Result[Dst] = AMDGPU::PV_X; - continue; - } - if (Dst == AMDGPU::OQAP) { - continue; - } - unsigned PVReg = 0; - switch (TRI.getHWRegChan(Dst)) { - case 0: - PVReg = AMDGPU::PV_X; - break; - case 1: - PVReg = AMDGPU::PV_Y; - break; - case 2: - PVReg = AMDGPU::PV_Z; - break; - case 3: - PVReg = AMDGPU::PV_W; - break; - default: - llvm_unreachable("Invalid Chan"); - } - Result[Dst] = PVReg; - } while ((++BI)->isBundledWithPred()); - return Result; - } - - void substitutePV(MachineInstr *MI, const DenseMap &PVs) - const { - unsigned Ops[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); - if (OperandIdx < 0) - continue; - unsigned Src = MI->getOperand(OperandIdx).getReg(); - const DenseMap::const_iterator It = PVs.find(Src); - if (It != PVs.end()) - MI->getOperand(OperandIdx).setReg(It->second); - } - } -public: - // Ctor. - R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), - TII(static_cast( - MF.getSubtarget().getInstrInfo())), - TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getSubtarget().hasCaymanISA(); - } - - // initPacketizerState - initialize some internal flags. - void initPacketizerState() override { - ConsideredInstUsesAlreadyWrittenVectorElement = false; - } - - // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { - return false; - } - - // isSoloInstruction - return true if instruction MI can not be packetized - // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { - if (TII->isVector(*MI)) - return true; - if (!TII->isALUInstr(MI->getOpcode())) - return true; - if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) - return true; - // XXX: This can be removed once the packetizer properly handles all the - // LDS instruction group restrictions. - if (TII->isLDSInstr(MI->getOpcode())) - return true; - return false; - } - - // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ - // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { - MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); - if (getSlot(MII) == getSlot(MIJ)) - ConsideredInstUsesAlreadyWrittenVectorElement = true; - // Does MII and MIJ share the same pred_sel ? 
- int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), - OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); - unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, - PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; - if (PredI != PredJ) - return false; - if (SUJ->isSucc(SUI)) { - for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { - const SDep &Dep = SUJ->Succs[i]; - if (Dep.getSUnit() != SUI) - continue; - if (Dep.getKind() == SDep::Anti) - continue; - if (Dep.getKind() == SDep::Output) - if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) - continue; - return false; - } - } - - bool ARDef = TII->definesAddressRegister(MII) || - TII->definesAddressRegister(MIJ); - bool ARUse = TII->usesAddressRegister(MII) || - TII->usesAddressRegister(MIJ); - if (ARDef && ARUse) - return false; - - return true; - } - - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI - // and SUJ. - bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override { - return false; - } - - void setIsLastBit(MachineInstr *MI, unsigned Bit) const { - unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); - MI->getOperand(LastOp).setImm(Bit); - } - - bool isBundlableWithCurrentPMI(MachineInstr *MI, - const DenseMap &PV, - std::vector &BS, - bool &isTransSlot) { - isTransSlot = TII->isTransOnly(MI); - assert (!isTransSlot || VLIW5); - - // Is the dst reg sequence legal ? - if (!isTransSlot && !CurrentPacketMIs.empty()) { - if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { - if (ConsideredInstUsesAlreadyWrittenVectorElement && - !TII->isVectorOnly(MI) && VLIW5) { - isTransSlot = true; - DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); - } - else - return false; - } - } - - // Are the Constants limitations met ? - CurrentPacketMIs.push_back(MI); - if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG( - dbgs() << "Couldn't pack :\n"; - MI->dump(); - dbgs() << "with the following packets :\n"; - for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { - CurrentPacketMIs[i]->dump(); - dbgs() << "\n"; - } - dbgs() << "because of Consts read limitations\n"; - ); - CurrentPacketMIs.pop_back(); - return false; - } - - // Is there a BankSwizzle set that meet Read Port limitations ? - if (!TII->fitsReadPortLimitations(CurrentPacketMIs, - PV, BS, isTransSlot)) { - DEBUG( - dbgs() << "Couldn't pack :\n"; - MI->dump(); - dbgs() << "with the following packets :\n"; - for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { - CurrentPacketMIs[i]->dump(); - dbgs() << "\n"; - } - dbgs() << "because of Read port limitations\n"; - ); - CurrentPacketMIs.pop_back(); - return false; - } - - // We cannot read LDS source registrs from the Trans slot. - if (isTransSlot && TII->readsLDSSrcReg(MI)) - return false; - - CurrentPacketMIs.pop_back(); - return true; - } - - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { - MachineBasicBlock::iterator FirstInBundle = - CurrentPacketMIs.empty() ? 
MI : CurrentPacketMIs.front(); - const DenseMap &PV = - getPreviousVector(FirstInBundle); - std::vector BS; - bool isTransSlot; - - if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) { - for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { - MachineInstr *MI = CurrentPacketMIs[i]; - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS[i]); - } - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS.back()); - if (!CurrentPacketMIs.empty()) - setIsLastBit(CurrentPacketMIs.back(), 0); - substitutePV(MI, PV); - MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI); - if (isTransSlot) { - endPacket(std::next(It)->getParent(), std::next(It)); - } - return It; - } - endPacket(MI->getParent(), MI); - if (TII->isTransOnly(MI)) - return MI; - return VLIWPacketizerList::addToPacket(MI); - } -}; - -bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - MachineLoopInfo &MLI = getAnalysis(); - - // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI); - - // DFA state table should not be empty. - assert(Packetizer.getResourceTracker() && "Empty DFA table!"); - - // - // Loop over all basic blocks and remove KILL pseudo-instructions - // These instructions confuse the dependence analysis. Consider: - // D0 = ... (Insn 0) - // R0 = KILL R0, D0 (Insn 1) - // R0 = ... (Insn 2) - // Here, Insn 1 will result in the dependence graph not emitting an output - // dependence between Insn 0 and Insn 2. This can lead to incorrect - // packetization - // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); - while (MI != End) { - if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || - (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; - } - ++MI; - } - } - - // Loop over all of the basic blocks. - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - // Find scheduling regions and schedule / packetize each region. - unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. - MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) - break; - } - I = MBB->begin(); - - // Skip empty scheduling regions. - if (I == RegionEnd) { - RegionEnd = std::prev(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. 
- if (I == std::prev(RegionEnd)) { - RegionEnd = std::prev(RegionEnd); - continue; - } - - Packetizer.PacketizeMIs(MBB, I, RegionEnd); - RegionEnd = I; - } - } - - return true; - -} - -} // end anonymous namespace - -llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { - return new R600Packetizer(tm); -} diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp deleted file mode 100644 index fb0359cfc65..00000000000 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ /dev/null @@ -1,91 +0,0 @@ -//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief R600 implementation of the TargetRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#include "R600RegisterInfo.h" -#include "AMDGPUTargetMachine.h" -#include "R600Defines.h" -#include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" - -using namespace llvm; - -R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { - RCW.RegWeight = 0; - RCW.WeightLimit = 0; -} - -BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - - const R600InstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - - Reserved.set(AMDGPU::ZERO); - Reserved.set(AMDGPU::HALF); - Reserved.set(AMDGPU::ONE); - Reserved.set(AMDGPU::ONE_INT); - Reserved.set(AMDGPU::NEG_HALF); - Reserved.set(AMDGPU::NEG_ONE); - Reserved.set(AMDGPU::PV_X); - Reserved.set(AMDGPU::ALU_LITERAL_X); - Reserved.set(AMDGPU::ALU_CONST); - Reserved.set(AMDGPU::PREDICATE_BIT); - Reserved.set(AMDGPU::PRED_SEL_OFF); - Reserved.set(AMDGPU::PRED_SEL_ZERO); - Reserved.set(AMDGPU::PRED_SEL_ONE); - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - - for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), - E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { - Reserved.set(*I); - } - - TII->reserveIndirectRegisters(Reserved, MF); - - return Reserved; -} - -unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { - return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; -} - -unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const { - return GET_REG_INDEX(getEncodingValue(Reg)); -} - -const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::R600_TReg32RegClass; - } -} - -const RegClassWeight &R600RegisterInfo::getRegClassWeight( - const TargetRegisterClass *RC) const { - return RCW; -} - -bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - - switch (Reg) { - case AMDGPU::OQAP: - case AMDGPU::OQBP: - case AMDGPU::AR_X: - return false; - default: - return true; - } -} diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h deleted file mode 100644 index 9713e600a72..00000000000 --- a/lib/Target/R600/R600RegisterInfo.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for R600RegisterInfo -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H -#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H - -#include "AMDGPURegisterInfo.h" - -namespace llvm { - -class AMDGPUSubtarget; - -struct R600RegisterInfo : public AMDGPURegisterInfo { - RegClassWeight RCW; - - R600RegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override; - - /// \brief get the HW encoding for a register's channel. - unsigned getHWRegChan(unsigned reg) const; - - unsigned getHWRegIndex(unsigned Reg) const override; - - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - - const RegClassWeight & - getRegClassWeight(const TargetRegisterClass *RC) const override; - - // \returns true if \p Reg can be defined in one ALU caluse and used in another. - bool isPhysRegLiveAcrossClauses(unsigned Reg) const; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td deleted file mode 100644 index cc667d985a8..00000000000 --- a/lib/Target/R600/R600RegisterInfo.td +++ /dev/null @@ -1,252 +0,0 @@ - -class R600Reg encoding> : Register { - let Namespace = "AMDGPU"; - let HWEncoding = encoding; -} - -class R600RegWithChan sel, string chan> : - Register { - - field bits<2> chan_encoding = !if(!eq(chan, "X"), 0, - !if(!eq(chan, "Y"), 1, - !if(!eq(chan, "Z"), 2, - !if(!eq(chan, "W"), 3, 0)))); - let HWEncoding{8-0} = sel; - let HWEncoding{10-9} = chan_encoding; - let Namespace = "AMDGPU"; -} - -class R600Reg_128 subregs, bits<16> encoding> : - RegisterWithSubRegs { - field bits<2> chan_encoding = 0; - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1, sub2, sub3]; - let HWEncoding{8-0} = encoding{8-0}; - let HWEncoding{10-9} = chan_encoding; -} - -class R600Reg_64 subregs, bits<16> encoding> : - RegisterWithSubRegs { - field bits<2> chan_encoding = 0; - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = encoding; - let HWEncoding{8-0} = encoding{8-0}; - let HWEncoding{10-9} = chan_encoding; -} - -class R600Reg_64Vertical : R600Reg_64 < - "V"#lo#hi#"_"#chan, - [!cast("T"#lo#"_"#chan), !cast("T"#hi#"_"#chan)], - lo ->; - -foreach Index = 0-127 in { - foreach Chan = [ "X", "Y", "Z", "W" ] in { - // 32-bit Temporary Registers - def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; - - // Indirect addressing offset registers - def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, - Index, Chan>; - } - // 128-bit Temporary Registers - def T#Index#_XYZW : R600Reg_128 <"T"#Index#"", - [!cast("T"#Index#"_X"), - !cast("T"#Index#"_Y"), - !cast("T"#Index#"_Z"), - !cast("T"#Index#"_W")], - Index>; - - def T#Index#_XY : R600Reg_64 <"T"#Index#"", - [!cast("T"#Index#"_X"), - !cast("T"#Index#"_Y")], - Index>; -} - -foreach Chan = [ "X", "Y", "Z", "W"] in { - - let chan_encoding = !if(!eq(Chan, "X"), 0, - !if(!eq(Chan, "Y"), 1, - !if(!eq(Chan, "Z"), 2, - !if(!eq(Chan, "W"), 3, 0)))) in { - def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, - [!cast("T0_"#Chan), - !cast("T1_"#Chan), - !cast("T2_"#Chan), - !cast("T3_"#Chan)], - 0>; - def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; - def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; - } -} 
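// Worked example (illustrative, based on the encoding fields above): for
// T5_Z, sel = 5 and the "Z" channel encodes to 2, so
//   HWEncoding = (2 << 9) | 5 = 0x405
// Assuming HW_CHAN_SHIFT is 9 (matching HWEncoding{10-9} above),
// R600RegisterInfo::getHWRegChan() recovers the channel as 0x405 >> 9 = 2,
// and GET_REG_INDEX() presumably masks the low nine bits to recover index 5.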
- - -// KCACHE_BANK0 -foreach Index = 159-128 in { - foreach Chan = [ "X", "Y", "Z", "W" ] in { - // 32-bit Temporary Registers - def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>; - } - // 128-bit Temporary Registers - def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW", - [!cast("KC0_"#Index#"_X"), - !cast("KC0_"#Index#"_Y"), - !cast("KC0_"#Index#"_Z"), - !cast("KC0_"#Index#"_W")], - Index>; -} - -// KCACHE_BANK1 -foreach Index = 191-160 in { - foreach Chan = [ "X", "Y", "Z", "W" ] in { - // 32-bit Temporary Registers - def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>; - } - // 128-bit Temporary Registers - def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW", - [!cast("KC1_"#Index#"_X"), - !cast("KC1_"#Index#"_Y"), - !cast("KC1_"#Index#"_Z"), - !cast("KC1_"#Index#"_W")], - Index>; -} - - -// Array Base Register holding input in FS -foreach Index = 448-480 in { - def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; -} - - -// Special Registers - -def OQA : R600Reg<"OQA", 219>; -def OQB : R600Reg<"OQB", 220>; -def OQAP : R600Reg<"OQAP", 221>; -def OQBP : R600Reg<"OQAP", 222>; -def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>; -def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>; -def ZERO : R600Reg<"0.0", 248>; -def ONE : R600Reg<"1.0", 249>; -def NEG_ONE : R600Reg<"-1.0", 249>; -def ONE_INT : R600Reg<"1", 250>; -def HALF : R600Reg<"0.5", 252>; -def NEG_HALF : R600Reg<"-0.5", 252>; -def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; -def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; -def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; -def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; -def PV_X : R600RegWithChan<"PV.X", 254, "X">; -def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">; -def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">; -def PV_W : R600RegWithChan<"PV.W", 254, "W">; -def PS: R600Reg<"PS", 255>; -def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; -def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; -def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; -def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; -def AR_X : R600Reg<"AR.x", 0>; - -def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "ArrayBase%u", 448, 480))>; -// special registers for ALU src operands -// const buffer reference, SRCx_SEL contains index -def ALU_CONST : R600Reg<"CBuf", 0>; -// interpolation param reference, SRCx_SEL contains index -def ALU_PARAM : R600Reg<"Param", 0>; - -let isAllocatable = 0 in { - -def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; - -// We only use Addr_[YZW] for vertical vectors. -// FIXME if we add more vertical vector registers we will need to ad more -// registers to these classes. 
-def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; -def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; -def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; - -def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, - (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; - -def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_X", 128, 159))>; - -def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_Y", 128, 159))>; - -def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_Z", 128, 159))>; - -def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC0_%u_W", 128, 159))>; - -def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_KC0_X, R600_KC0_Y, - R600_KC0_Z, R600_KC0_W)>; - -def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_X", 160, 191))>; - -def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_Y", 160, 191))>; - -def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_Z", 160, 191))>; - -def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "KC1_%u_W", 160, 191))>; - -def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_KC1_X, R600_KC1_Y, - R600_KC1_Z, R600_KC1_W)>; - -} // End isAllocatable = 0 - -def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_X", 0, 127), AR_X)>; - -def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_Y", 0, 127))>; - -def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_Z", 0, 127))>; - -def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_W", 0, 127))>; - -def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, - (interleave R600_TReg32_X, R600_TReg32_Y, - R600_TReg32_Z, R600_TReg32_W)>; - -def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add - R600_TReg32, - R600_ArrayBase, - R600_Addr, - R600_KC0, R600_KC1, - ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP - )>; - -def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add - PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; - -def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add - PREDICATE_BIT)>; - -def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, - (add (sequence "T%u_XYZW", 0, 127))> { - let CopyCost = -1; -} - -def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, - (add V0123_W, V0123_Z, V0123_Y, V0123_X) ->; - -def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, - (add (sequence "T%u_XY", 0, 63))>; - -def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, - (add V01_X, V01_Y, V01_Z, V01_W, - V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td deleted file mode 100644 index df62bf85c0a..00000000000 --- a/lib/Target/R600/R600Schedule.td +++ /dev/null @@ -1,49 +0,0 @@ -//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// R600 has a VLIW architecture. 
On pre-cayman cards there are 5 instruction -// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS -// slot has been removed. -// -//===----------------------------------------------------------------------===// - - -def ALU_X : FuncUnit; -def ALU_Y : FuncUnit; -def ALU_Z : FuncUnit; -def ALU_W : FuncUnit; -def TRANS : FuncUnit; - -def AnyALU : InstrItinClass; -def VecALU : InstrItinClass; -def TransALU : InstrItinClass; -def XALU : InstrItinClass; - -def R600_VLIW5_Itin : ProcessorItineraries < - [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], - [], - [ - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]> - ] ->; - -def R600_VLIW4_Itin : ProcessorItineraries < - [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], - [], - [ - InstrItinData]>, - InstrItinData]>, - InstrItinData]>, - InstrItinData]> - ] ->; diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp deleted file mode 100644 index 2fc7b02f673..00000000000 --- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp +++ /dev/null @@ -1,303 +0,0 @@ -//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass translates tgsi-like texture intrinsics into R600 texture -/// closer to hardware intrinsics. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { -class R600TextureIntrinsicsReplacer : - public FunctionPass, public InstVisitor { - static char ID; - - Module *Mod; - Type *FloatType; - Type *Int32Type; - Type *V4f32Type; - Type *V4i32Type; - FunctionType *TexSign; - FunctionType *TexQSign; - - void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, - unsigned SrcSelect[4], unsigned CT[4], - bool &useShadowVariant) { - enum TextureTypes { - TEXTURE_1D = 1, - TEXTURE_2D, - TEXTURE_3D, - TEXTURE_CUBE, - TEXTURE_RECT, - TEXTURE_SHADOW1D, - TEXTURE_SHADOW2D, - TEXTURE_SHADOWRECT, - TEXTURE_1D_ARRAY, - TEXTURE_2D_ARRAY, - TEXTURE_SHADOW1D_ARRAY, - TEXTURE_SHADOW2D_ARRAY, - TEXTURE_SHADOWCUBE, - TEXTURE_2D_MSAA, - TEXTURE_2D_ARRAY_MSAA, - TEXTURE_CUBE_ARRAY, - TEXTURE_SHADOWCUBE_ARRAY - }; - - switch (TextureType) { - case 0: - useShadowVariant = false; - return; - case TEXTURE_RECT: - case TEXTURE_1D: - case TEXTURE_2D: - case TEXTURE_3D: - case TEXTURE_CUBE: - case TEXTURE_1D_ARRAY: - case TEXTURE_2D_ARRAY: - case TEXTURE_CUBE_ARRAY: - case TEXTURE_2D_MSAA: - case TEXTURE_2D_ARRAY_MSAA: - useShadowVariant = false; - break; - case TEXTURE_SHADOW1D: - case TEXTURE_SHADOW2D: - case TEXTURE_SHADOWRECT: - case TEXTURE_SHADOW1D_ARRAY: - case TEXTURE_SHADOW2D_ARRAY: - case TEXTURE_SHADOWCUBE: - case TEXTURE_SHADOWCUBE_ARRAY: - useShadowVariant = true; - break; - default: - llvm_unreachable("Unknow Texture Type"); - } - - if (TextureType == TEXTURE_RECT || - TextureType == TEXTURE_SHADOWRECT) { - CT[0] = 0; - CT[1] = 0; - } - - if (TextureType == TEXTURE_CUBE_ARRAY || - TextureType == TEXTURE_SHADOWCUBE_ARRAY) - CT[2] = 0; - - if (TextureType == 
TEXTURE_1D_ARRAY || - TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (hasLOD && useShadowVariant) { - CT[1] = 0; - } else { - CT[2] = 0; - SrcSelect[2] = 1; - } - } else if (TextureType == TEXTURE_2D_ARRAY || - TextureType == TEXTURE_SHADOW2D_ARRAY) { - CT[2] = 0; - } - - if ((TextureType == TEXTURE_SHADOW1D || - TextureType == TEXTURE_SHADOW2D || - TextureType == TEXTURE_SHADOWRECT || - TextureType == TEXTURE_SHADOW1D_ARRAY) && - !(hasLOD && useShadowVariant)) - SrcSelect[3] = 2; - } - - void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, - unsigned SrcSelect[4], Value *Offset[3], Value *Resource, - Value *Sampler, unsigned CT[4], Value *Coord) { - IRBuilder<> Builder(&I); - Constant *Mask[] = { - ConstantInt::get(Int32Type, SrcSelect[0]), - ConstantInt::get(Int32Type, SrcSelect[1]), - ConstantInt::get(Int32Type, SrcSelect[2]), - ConstantInt::get(Int32Type, SrcSelect[3]) - }; - Value *SwizzleMask = ConstantVector::get(Mask); - Value *SwizzledCoord = - Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); - - Value *Args[] = { - SwizzledCoord, - Offset[0], - Offset[1], - Offset[2], - Resource, - Sampler, - ConstantInt::get(Int32Type, CT[0]), - ConstantInt::get(Int32Type, CT[1]), - ConstantInt::get(Int32Type, CT[2]), - ConstantInt::get(Int32Type, CT[3]) - }; - - Function *F = Mod->getFunction(Name); - if (!F) { - F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); - F->addFnAttr(Attribute::ReadNone); - } - I.replaceAllUsesWith(Builder.CreateCall(F, Args)); - I.eraseFromParent(); - } - - void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, - const char *VanillaInt, - const char *ShadowInt) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(1); - Value *SamplerId = I.getArgOperand(2); - - unsigned TextureType = - cast(I.getArgOperand(3))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0) - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - - void ReplaceTXF(CallInst &I) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(4); - Value *SamplerId = I.getArgOperand(5); - - unsigned TextureType = - cast(I.getArgOperand(6))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - I.getArgOperand(1), - I.getArgOperand(2), - I.getArgOperand(3), - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - -public: - R600TextureIntrinsicsReplacer(): - FunctionPass(ID) { - } - - bool doInitialization(Module &M) override { - LLVMContext &Ctx = M.getContext(); - Mod = &M; - FloatType = Type::getFloatTy(Ctx); - Int32Type = Type::getInt32Ty(Ctx); - V4f32Type = VectorType::get(FloatType, 4); - V4i32Type = VectorType::get(Int32Type, 4); - Type *ArgsType[] = { - V4f32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); - Type *ArgsQType[] = { - V4i32Type, - Int32Type, - Int32Type, 
- Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); - return false; - } - - bool runOnFunction(Function &F) override { - visit(F); - return false; - } - - const char *getPassName() const override { - return "R600 Texture Intrinsics Replacer"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - } - - void visitCallInst(CallInst &I) { - if (!I.getCalledFunction()) - return; - - StringRef Name = I.getCalledFunction()->getName(); - if (Name == "llvm.AMDGPU.tex") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); - return; - } - if (Name == "llvm.AMDGPU.txl") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); - return; - } - if (Name == "llvm.AMDGPU.txb") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); - return; - } - if (Name == "llvm.AMDGPU.txf") { - ReplaceTXF(I); - return; - } - if (Name == "llvm.AMDGPU.txq") { - ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); - return; - } - if (Name == "llvm.AMDGPU.ddx") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); - return; - } - if (Name == "llvm.AMDGPU.ddy") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); - return; - } - } - -}; - -char R600TextureIntrinsicsReplacer::ID = 0; - -} - -FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { - return new R600TextureIntrinsicsReplacer(); -} diff --git a/lib/Target/R600/R700Instructions.td b/lib/Target/R600/R700Instructions.td deleted file mode 100644 index 613a0d729bb..00000000000 --- a/lib/Target/R600/R700Instructions.td +++ /dev/null @@ -1,21 +0,0 @@ -//===-- R700Instructions.td - R700 Instruction defs -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// TableGen definitions for instructions which are: -// - Available to R700 and newer VLIW4/VLIW5 GPUs -// - Available only on R700 family GPUs. -// -//===----------------------------------------------------------------------===// - -def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; - -let Predicates = [isR700] in { - def SIN_r700 : SIN_Common<0x6E>; - def COS_r700 : COS_Common<0x6F>; -} diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp deleted file mode 100644 index ccfbf1bf19e..00000000000 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ /dev/null @@ -1,365 +0,0 @@ -//===-- SIAnnotateControlFlow.cpp - ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Annotates the control flow with hardware specific intrinsics. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-annotate-control-flow" - -namespace { - -// Complex types used in this pass -typedef std::pair StackEntry; -typedef SmallVector StackVector; - -// Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.SI.if"; -static const char *const ElseIntrinsic = "llvm.SI.else"; -static const char *const BreakIntrinsic = "llvm.SI.break"; -static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; -static const char *const LoopIntrinsic = "llvm.SI.loop"; -static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; - -class SIAnnotateControlFlow : public FunctionPass { - - static char ID; - - Type *Boolean; - Type *Void; - Type *Int64; - Type *ReturnStruct; - - ConstantInt *BoolTrue; - ConstantInt *BoolFalse; - UndefValue *BoolUndef; - Constant *Int64Zero; - - Constant *If; - Constant *Else; - Constant *Break; - Constant *IfBreak; - Constant *ElseBreak; - Constant *Loop; - Constant *EndCf; - - DominatorTree *DT; - StackVector Stack; - - LoopInfo *LI; - - bool isTopOfStack(BasicBlock *BB); - - Value *popSaved(); - - void push(BasicBlock *BB, Value *Saved); - - bool isElse(PHINode *Phi); - - void eraseIfUnused(PHINode *Phi); - - void openIf(BranchInst *Term); - - void insertElse(BranchInst *Term); - - Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); - - void handleLoop(BranchInst *Term); - - void closeControlFlow(BasicBlock *BB); - -public: - SIAnnotateControlFlow(): - FunctionPass(ID) { } - - bool doInitialization(Module &M) override; - - bool runOnFunction(Function &F) override; - - const char *getPassName() const override { - return "SI annotate control flow"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - FunctionPass::getAnalysisUsage(AU); - } - -}; - -} // end anonymous namespace - -char SIAnnotateControlFlow::ID = 0; - -/// \brief Initialize all the types and constants used in the pass -bool SIAnnotateControlFlow::doInitialization(Module &M) { - LLVMContext &Context = M.getContext(); - - Void = Type::getVoidTy(Context); - Boolean = Type::getInt1Ty(Context); - Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); - - BoolTrue = ConstantInt::getTrue(Context); - BoolFalse = ConstantInt::getFalse(Context); - BoolUndef = UndefValue::get(Boolean); - Int64Zero = ConstantInt::get(Int64, 0); - - If = M.getOrInsertFunction( - IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr); - - Else = M.getOrInsertFunction( - ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr); - - Break = M.getOrInsertFunction( - BreakIntrinsic, Int64, Int64, (Type *)nullptr); - - IfBreak = M.getOrInsertFunction( - IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr); - - ElseBreak = M.getOrInsertFunction( - ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr); - - Loop = M.getOrInsertFunction( - LoopIntrinsic, Boolean, Int64, (Type *)nullptr); - - EndCf = M.getOrInsertFunction( 
- EndCfIntrinsic, Void, Int64, (Type *)nullptr); - - return false; -} - -/// \brief Is BB the last block saved on the stack ? -bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { - return !Stack.empty() && Stack.back().first == BB; -} - -/// \brief Pop the last saved value from the control flow stack -Value *SIAnnotateControlFlow::popSaved() { - return Stack.pop_back_val().second; -} - -/// \brief Push a BB and saved value to the control flow stack -void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { - Stack.push_back(std::make_pair(BB, Saved)); -} - -/// \brief Can the condition represented by this PHI node treated like -/// an "Else" block? -bool SIAnnotateControlFlow::isElse(PHINode *Phi) { - BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - if (Phi->getIncomingBlock(i) == IDom) { - - if (Phi->getIncomingValue(i) != BoolTrue) - return false; - - } else { - if (Phi->getIncomingValue(i) != BoolFalse) - return false; - - } - } - return true; -} - -// \brief Erase "Phi" if it is not used any more -void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (!Phi->hasNUsesOrMore(1)) - Phi->eraseFromParent(); -} - -/// \brief Open a new "If" block -void SIAnnotateControlFlow::openIf(BranchInst *Term) { - Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); -} - -/// \brief Close the last "If" block and open a new "Else" block -void SIAnnotateControlFlow::insertElse(BranchInst *Term) { - Value *Ret = CallInst::Create(Else, popSaved(), "", Term); - Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); - push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); -} - -/// \brief Recursively handle the condition leading to a loop -Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L) { - - // Only search through PHI nodes which are inside the loop. If we try this - // with PHI nodes that are outside of the loop, we end up inserting new PHI - // nodes outside of the loop which depend on values defined inside the loop. - // This will break the module with - // 'Instruction does not dominate all users!' errors. 
- PHINode *Phi = nullptr; - if ((Phi = dyn_cast(Cond)) && L->contains(Phi)) { - - BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front()); - Value *Ret = NewPhi; - - // Handle all non-constant incoming values first - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - BasicBlock *From = Phi->getIncomingBlock(i); - if (isa(Incoming)) { - NewPhi->addIncoming(Broken, From); - continue; - } - - Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L); - NewPhi->addIncoming(PhiArg, From); - } - - BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); - - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - - Value *Incoming = Phi->getIncomingValue(i); - if (Incoming != BoolTrue) - continue; - - BasicBlock *From = Phi->getIncomingBlock(i); - if (From == IDom) { - CallInst *OldEnd = dyn_cast(Parent->getFirstInsertionPt()); - if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; - Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - continue; - } - } - TerminatorInst *Insert = From->getTerminator(); - Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); - NewPhi->setIncomingValue(i, PhiArg); - } - eraseIfUnused(Phi); - return Ret; - - } else if (Instruction *Inst = dyn_cast(Cond)) { - BasicBlock *Parent = Inst->getParent(); - Instruction *Insert; - if (L->contains(Inst)) { - Insert = Parent->getTerminator(); - } else { - Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime(); - } - Value *Args[] = { Cond, Broken }; - return CallInst::Create(IfBreak, Args, "", Insert); - - } else { - llvm_unreachable("Unhandled loop condition!"); - } - return 0; -} - -/// \brief Handle a back edge (loop) -void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { - BasicBlock *BB = Term->getParent(); - llvm::Loop *L = LI->getLoopFor(BB); - BasicBlock *Target = Term->getSuccessor(1); - PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); - - Value *Cond = Term->getCondition(); - Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L); - - for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); - PI != PE; ++PI) { - - Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI); - } - - Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - push(Term->getSuccessor(0), Arg); -}/// \brief Close the last opened control flow -void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { - llvm::Loop *L = LI->getLoopFor(BB); - - if (L && L->getHeader() == BB) { - // We can't insert an EndCF call into a loop header, because it will - // get executed on every iteration of the loop, when it should be - // executed only once before the loop. - SmallVector Latches; - L->getLoopLatches(Latches); - - std::vector Preds; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { - if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) - Preds.push_back(*PI); - } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); - } - - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); -} - -/// \brief Annotate the control flow with intrinsics so the backend can -/// recognize if/then/else and loops. 
-bool SIAnnotateControlFlow::runOnFunction(Function &F) { - DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); - - for (df_iterator I = df_begin(&F.getEntryBlock()), - E = df_end(&F.getEntryBlock()); I != E; ++I) { - - BranchInst *Term = dyn_cast((*I)->getTerminator()); - - if (!Term || Term->isUnconditional()) { - if (isTopOfStack(*I)) - closeControlFlow(*I); - continue; - } - - if (I.nodeVisited(Term->getSuccessor(1))) { - if (isTopOfStack(*I)) - closeControlFlow(*I); - handleLoop(Term); - continue; - } - - if (isTopOfStack(*I)) { - PHINode *Phi = dyn_cast(Term->getCondition()); - if (Phi && Phi->getParent() == *I && isElse(Phi)) { - insertElse(Term); - eraseIfUnused(Phi); - continue; - } - closeControlFlow(*I); - } - openIf(Term); - } - - assert(Stack.empty()); - return true; -} - -/// \brief Create the annotation pass -FunctionPass *llvm::createSIAnnotateControlFlowPass() { - return new SIAnnotateControlFlow(); -} diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h deleted file mode 100644 index 4727d971ab7..00000000000 --- a/lib/Target/R600/SIDefines.h +++ /dev/null @@ -1,172 +0,0 @@ -//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCInstrDesc.h" - -#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H -#define LLVM_LIB_TARGET_R600_SIDEFINES_H - -namespace SIInstrFlags { -// This needs to be kept in sync with the field bits in InstSI. -enum { - SALU = 1 << 3, - VALU = 1 << 4, - - SOP1 = 1 << 5, - SOP2 = 1 << 6, - SOPC = 1 << 7, - SOPK = 1 << 8, - SOPP = 1 << 9, - - VOP1 = 1 << 10, - VOP2 = 1 << 11, - VOP3 = 1 << 12, - VOPC = 1 << 13, - - MUBUF = 1 << 14, - MTBUF = 1 << 15, - SMRD = 1 << 16, - DS = 1 << 17, - MIMG = 1 << 18, - FLAT = 1 << 19, - WQM = 1 << 20, - VGPRSpill = 1 << 21 -}; -} - -namespace llvm { -namespace AMDGPU { - enum OperandType { - /// Operand with register or 32-bit immediate - OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, - /// Operand with register or inline constant - OPERAND_REG_INLINE_C - }; -} -} - -namespace SIInstrFlags { - enum Flags { - // First 4 bits are the instruction encoding - VM_CNT = 1 << 0, - EXP_CNT = 1 << 1, - LGKM_CNT = 1 << 2 - }; - - // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. - // The result is true if any of these tests are true. 
- enum ClassFlags { - S_NAN = 1 << 0, // Signaling NaN - Q_NAN = 1 << 1, // Quiet NaN - N_INFINITY = 1 << 2, // Negative infinity - N_NORMAL = 1 << 3, // Negative normal - N_SUBNORMAL = 1 << 4, // Negative subnormal - N_ZERO = 1 << 5, // Negative zero - P_ZERO = 1 << 6, // Positive zero - P_SUBNORMAL = 1 << 7, // Positive subnormal - P_NORMAL = 1 << 8, // Positive normal - P_INFINITY = 1 << 9 // Positive infinity - }; -} - -namespace SISrcMods { - enum { - NEG = 1 << 0, - ABS = 1 << 1 - }; -} - -namespace SIOutMods { - enum { - NONE = 0, - MUL2 = 1, - MUL4 = 2, - DIV2 = 3 - }; -} - -#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 -#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C -#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) -#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 -#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 -#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 -#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) -#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) -#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C -#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) -#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) -#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) -#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) -#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) -#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) -#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) - -#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) -#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - - -#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 -#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) -#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) -#define C_00B848_VGPRS 0xFFFFFFC0 -#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) -#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) -#define C_00B848_SGPRS 0xFFFFFC3F -#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) -#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) -#define C_00B848_PRIORITY 0xFFFFF3FF -#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) -#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) -#define C_00B848_FLOAT_MODE 0xFFF00FFF -#define S_00B848_PRIV(x) (((x) & 0x1) << 20) -#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) -#define C_00B848_PRIV 0xFFEFFFFF -#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) -#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) -#define C_00B848_DX10_CLAMP 0xFFDFFFFF -#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) -#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) -#define C_00B848_DEBUG_MODE 0xFFBFFFFF -#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) -#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) -#define C_00B848_IEEE_MODE 0xFF7FFFFF - - -// Helpers for setting FLOAT_MODE -#define FP_ROUND_ROUND_TO_NEAREST 0 -#define FP_ROUND_ROUND_TO_INF 1 -#define FP_ROUND_ROUND_TO_NEGINF 2 -#define FP_ROUND_ROUND_TO_ZERO 3 - -// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double -// precision. -#define FP_ROUND_MODE_SP(x) ((x) & 0x3) -#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) - -#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 -#define FP_DENORM_FLUSH_OUT 1 -#define FP_DENORM_FLUSH_IN 2 -#define FP_DENORM_FLUSH_NONE 3 - - -// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double -// precision. 
-#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) -#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) - -#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 -#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) - -#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 -#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) - - -#endif diff --git a/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp b/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp deleted file mode 100644 index 5fe8d19426d..00000000000 --- a/lib/Target/R600/SIFixControlFlowLiveIntervals.cpp +++ /dev/null @@ -1,96 +0,0 @@ -//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Spilling of EXEC masks used for control flow messes up control flow -/// lowering, so mark all live intervals associated with CF instructions as -/// non-spillable. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-cf-live-intervals" - -namespace { - -class SIFixControlFlowLiveIntervals : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix CF Live Intervals"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. 
- -INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, - "SI Fix CF Live Intervals", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, - "SI Fix CF Live Intervals", false, false) - -char SIFixControlFlowLiveIntervals::ID = 0; - -char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; - -FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { - return new SIFixControlFlowLiveIntervals(); -} - -bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { - LiveIntervals *LIS = &getAnalysis(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - switch (MI.getOpcode()) { - case AMDGPU::SI_IF: - case AMDGPU::SI_ELSE: - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - case AMDGPU::SI_END_CF: { - unsigned Reg = MI.getOperand(0).getReg(); - LIS->getInterval(Reg).markNotSpillable(); - break; - } - default: - break; - } - } - } - - return false; -} diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp deleted file mode 100644 index 23502b45905..00000000000 --- a/lib/Target/R600/SIFixSGPRCopies.cpp +++ /dev/null @@ -1,338 +0,0 @@ -//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Copies from VGPR to SGPR registers are illegal and the register coalescer -/// will sometimes generate these illegal copies in situations like this: -/// -/// Register Class is the union of and -/// -/// BB0: -/// %vreg0 = SCALAR_INST -/// %vreg1 = COPY %vreg0 -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 = VECTOR_INST -/// %vreg3 = COPY %vreg2 -/// BB2: -/// %vreg4 = PHI %vreg1 , , %vreg3 , -/// %vreg5 = VECTOR_INST %vreg4 -/// -/// -/// The coalescer will begin at BB0 and eliminate its copy, then the resulting -/// code will look like this: -/// -/// BB0: -/// %vreg0 = SCALAR_INST -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 = VECTOR_INST -/// %vreg3 = COPY %vreg2 -/// BB2: -/// %vreg4 = PHI %vreg0 , , %vreg3 , -/// %vreg5 = VECTOR_INST %vreg4 -/// -/// Now that the result of the PHI instruction is an SGPR, the register -/// allocator is now forced to constrain the register class of %vreg3 to -/// so we end up with final code like this: -/// -/// BB0: -/// %vreg0 = SCALAR_INST -/// ... -/// BRANCH %cond BB1, BB2 -/// BB1: -/// %vreg2 = VECTOR_INST -/// %vreg3 = COPY %vreg2 -/// BB2: -/// %vreg4 = PHI %vreg0 , , %vreg3 , -/// %vreg5 = VECTOR_INST %vreg4 -/// -/// Now this code contains an illegal copy from a VGPR to an SGPR. -/// -/// In order to avoid this problem, this pass searches for PHI instructions -/// which define a register and constrains its definition class to -/// if the user of the PHI's definition register is a vector instruction. -/// If the PHI's definition class is constrained to then the coalescer -/// will be unable to perform the COPY removal from the above example which -/// ultimately led to the creation of an illegal COPY. 
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "sgpr-copies" - -namespace { - -class SIFixSGPRCopies : public MachineFunctionPass { - -private: - static char ID; - const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; - -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR copies"; - } - -}; - -} // End anonymous namespace - -char SIFixSGPRCopies::ID = 0; - -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); -} - -static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - if (!MI.getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) - continue; - - if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg()))) - return true; - } - return false; -} - -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? 
- MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } - - return RC; -} - -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } - - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); -} - -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { - - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); - - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. - return false; - } - - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); - - const TargetRegisterClass *SrcRC; - - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) - return false; - - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); -} - -bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } - - switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); - - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); - } - - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. 
- // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // FIXME: This is OK if the branching decision is made based on an - // SGPR value. - bool SGPRBranch = false; - - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. - - bool HasBreakDef = false; - for (unsigned i = 1; i < MI.getNumOperands(); i+=2) { - unsigned Reg = MI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) { - TII->moveToVALU(MI); - break; - } - MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); - assert(DefInstr); - switch(DefInstr->getOpcode()) { - - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - // If we see a PHI instruction that defines an SGPR, then that PHI - // instruction has already been considered and should have - // a *_BREAK as an operand. - case AMDGPU::PHI: - HasBreakDef = true; - break; - } - } - - if (!SGPRBranch && !HasBreakDef) - TII->moveToVALU(MI); - break; - } - case AMDGPU::REG_SEQUENCE: { - if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) - continue; - - DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); - - TII->moveToVALU(MI); - break; - } - case AMDGPU::INSERT_SUBREG: { - const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); - if (TRI->isSGPRClass(DstRC) && - (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); - TII->moveToVALU(MI); - } - break; - } - } - } - } - - return true; -} diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp deleted file mode 100644 index 0c54446b0fb..00000000000 --- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp +++ /dev/null @@ -1,192 +0,0 @@ -//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. -/// -/// The main case we need to handle is when a def is used in one side of a -/// branch and not another. For example: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// -/// Here we need the register allocator to avoid assigning any of the defs -/// inside of the IF to the same register as %def. 
In traditional live -/// interval analysis %def is not live inside the IF branch, however, since -/// SALU instructions inside of IF will be executed even if the branch is not -/// taken, there is the chance that one of the instructions will overwrite the -/// value of %def, so the use in ELSE will see the wrong value. -/// -/// The strategy we use for solving this is to add an extra use after the ENDIF: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// %use -/// -/// Adding this use will make the def live thoughout the IF branch, which is -/// what we want. - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-sgpr-live-ranges" - -namespace { - -class SIFixSGPRLiveRanges : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { - initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR live ranges"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) - -char SIFixSGPRLiveRanges::ID = 0; - -char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; - -FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { - return new SIFixSGPRLiveRanges(); -} - -bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const SIRegisterInfo *TRI = static_cast( - MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis(); - MachinePostDominatorTree *PDT = &getAnalysis(); - std::vector> SGPRLiveRanges; - - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; - unsigned Def = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); - } - } - } - } - - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) - continue; - - // We have structured control flow, so number of succesors should be two. 
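(A rough illustration of the strategy described in the file comment above; hypothetical MIR, register and block names invented for the sketch, not part of the original source. The artificial use inserted further down lands at the join block, keeping the SGPR live through both arms:)

    %def = S_MOV_B32 ...            ; defined before the branch
    bb.if:    <other SALU defs>     ; SALU still executes here even when not "taken"
    bb.else:  ... = %def            ; the real use
    bb.endif:
      SGPR_USE implicit %def        ; artificial use at the nearest common post-dominator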
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); - MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); - - if (!NCD) - continue; - - MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); - - if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { - assert(NCD->succ_size() == 2); - // We want to make sure we insert the Use after the ENDIF, not after - // the ELSE. - NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), - *(++NCD->succ_begin())); - } - assert(SuccA && SuccB); - for (std::pair RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) - continue; - - // This interval is live in to one successor, but not the other, so - // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); - - // FIXME: Need to figure out how to update LiveRange here so this pass - // will be able to preserve LiveInterval analysis. - BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); - } - } - - return false; -} diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp deleted file mode 100644 index d14e37a6461..00000000000 --- a/lib/Target/R600/SIFoldOperands.cpp +++ /dev/null @@ -1,288 +0,0 @@ -//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -/// \file -//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#define DEBUG_TYPE "si-fold-operands" -using namespace llvm; - -namespace { - -class SIFoldOperands : public MachineFunctionPass { -public: - static char ID; - -public: - SIFoldOperands() : MachineFunctionPass(ID) { - initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fold Operands"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -struct FoldCandidate { - MachineInstr *UseMI; - unsigned UseOpNo; - MachineOperand *OpToFold; - uint64_t ImmToFold; - - FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) : - UseMI(MI), UseOpNo(OpNo) { - - if (FoldOp->isImm()) { - OpToFold = nullptr; - ImmToFold = FoldOp->getImm(); - } else { - assert(FoldOp->isReg()); - OpToFold = FoldOp; - } - } - - bool isImm() const { - return !OpToFold; - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) - -char SIFoldOperands::ID = 0; - -char &llvm::SIFoldOperandsID = SIFoldOperands::ID; - -FunctionPass *llvm::createSIFoldOperandsPass() { - return new SIFoldOperands(); -} - -static bool isSafeToFold(unsigned Opcode) { - switch(Opcode) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_B64_PSEUDO: - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::COPY: - return true; - default: - return false; - } -} - -static bool updateOperand(FoldCandidate &Fold, - const TargetRegisterInfo &TRI) { - MachineInstr *MI = Fold.UseMI; - MachineOperand &Old = MI->getOperand(Fold.UseOpNo); - assert(Old.isReg()); - - if (Fold.isImm()) { - Old.ChangeToImmediate(Fold.ImmToFold); - return true; - } - - MachineOperand *New = Fold.OpToFold; - if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) && - TargetRegisterInfo::isVirtualRegister(New->getReg())) { - Old.substVirtReg(New->getReg(), New->getSubReg(), TRI); - return true; - } - - // FIXME: Handle physical registers. - - return false; -} - -static bool tryAddToFoldList(std::vector &FoldList, - MachineInstr *MI, unsigned OpNo, - MachineOperand *OpToFold, - const SIInstrInfo *TII) { - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { - // Operand is not legal, so try to commute the instruction to - // see if this makes it possible to fold. 
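(A hedged illustration of why commuting helps, with approximate assembly and invented operand names, not taken from the original file: in the 32-bit VOP2 encodings only src0 can be a constant, so an immediate sitting in the src1 slot becomes foldable once the operands are swapped:)

    v_add_i32_e32 %dst, vcc, %v0, 64   ; immediate in src1 -> fold not legal
    v_add_i32_e32 %dst, vcc, 64, %v0   ; after commuting, immediate in src0 -> legal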
- unsigned CommuteIdx0; - unsigned CommuteIdx1; - bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); - - if (CanCommute) { - if (CommuteIdx0 == OpNo) - OpNo = CommuteIdx1; - else if (CommuteIdx1 == OpNo) - OpNo = CommuteIdx0; - } - - if (!CanCommute || !TII->commuteInstruction(MI)) - return false; - - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) - return false; - } - - FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); - return true; -} - -bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - if (!isSafeToFold(MI.getOpcode())) - continue; - - unsigned OpSize = TII->getOpSize(MI, 1); - MachineOperand &OpToFold = MI.getOperand(1); - bool FoldingImm = OpToFold.isImm(); - - // FIXME: We could also be folding things like FrameIndexes and - // TargetIndexes. - if (!FoldingImm && !OpToFold.isReg()) - continue; - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && - !MRI.hasOneUse(MI.getOperand(0).getReg())) - continue; - - // FIXME: Fold operands with subregs. - if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) - continue; - - std::vector FoldList; - for (MachineRegisterInfo::use_iterator - Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); - Use != E; ++Use) { - - MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? - MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. 
- if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. - } - - for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, TRI)) { - // Clear kill flags. - if (!Fold.isImm()) { - assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); - } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); - } - } - } - } - return false; -} diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp deleted file mode 100644 index 12d08cf4c7f..00000000000 --- a/lib/Target/R600/SIISelLowering.cpp +++ /dev/null @@ -1,2241 +0,0 @@ -//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Custom DAG lowering for SI -// -//===----------------------------------------------------------------------===// - -#ifdef _MSC_VER -// Provide M_PI. -#define _USE_MATH_DEFINES -#include -#endif - -#include "SIISelLowering.h" -#include "AMDGPU.h" -#include "AMDGPUIntrinsicInfo.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/Function.h" -#include "llvm/ADT/SmallString.h" - -using namespace llvm; - -SITargetLowering::SITargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) - : AMDGPUTargetLowering(TM, STI) { - addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); - addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - - addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); - addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - - addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); - - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); - - addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); - - addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); - - addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); - - computeRegisterProperties(STI.getRegisterInfo()); - - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - - setOperationAction(ISD::ADD, MVT::i32, Legal); - setOperationAction(ISD::ADDC, 
MVT::i32, Legal); - setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::SUBC, MVT::i32, Legal); - setOperationAction(ISD::SUBE, MVT::i32, Legal); - - setOperationAction(ISD::FSIN, MVT::f32, Custom); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - - // We need to custom lower vector stores from local memory - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - setOperationAction(ISD::LOAD, MVT::v8i32, Custom); - setOperationAction(ISD::LOAD, MVT::v16i32, Custom); - - setOperationAction(ISD::STORE, MVT::v8i32, Custom); - setOperationAction(ISD::STORE, MVT::v16i32, Custom); - - setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setOperationAction(ISD::SELECT, MVT::i64, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Promote); - AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - - setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); - setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); - - setOperationAction(ISD::SETCC, MVT::v2i1, Expand); - setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - - setOperationAction(ISD::BSWAP, MVT::i32, Legal); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - if (VT == MVT::i64) - continue; - - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); - } - - for (MVT VT : MVT::fp_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - - setTruncStoreAction(MVT::i64, MVT::i32, 
Expand); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - - setOperationAction(ISD::LOAD, MVT::i1, Custom); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - // These should use UDIVREM, so set them to expand - setOperationAction(ISD::UDIV, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT, MVT::i1, Promote); - - // We only support LOAD/STORE and vector manipulation ops for vectors - // with > 4 elements. - for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { - for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { - switch(Op) { - case ISD::LOAD: - case ISD::STORE: - case ISD::BUILD_VECTOR: - case ISD::BITCAST: - case ISD::EXTRACT_VECTOR_ELT: - case ISD::INSERT_VECTOR_ELT: - case ISD::INSERT_SUBVECTOR: - case ISD::EXTRACT_SUBVECTOR: - break; - case ISD::CONCAT_VECTORS: - setOperationAction(Op, VT, Custom); - break; - default: - setOperationAction(Op, VT, Expand); - break; - } - } - } - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FTRUNC, MVT::f64, Legal); - setOperationAction(ISD::FCEIL, MVT::f64, Legal); - setOperationAction(ISD::FRINT, MVT::f64, Legal); - } - - setOperationAction(ISD::FFLOOR, MVT::f64, Legal); - setOperationAction(ISD::FDIV, MVT::f32, Custom); - setOperationAction(ISD::FDIV, MVT::f64, Custom); - - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); - setTargetDAGCombine(ISD::FMINNUM); - setTargetDAGCombine(ISD::FMAXNUM); - setTargetDAGCombine(ISD::SMIN); - setTargetDAGCombine(ISD::SMAX); - setTargetDAGCombine(ISD::UMIN); - setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::AND); - setTargetDAGCombine(ISD::OR); - setTargetDAGCombine(ISD::UINT_TO_FP); - - // All memory operations. Some folding on the pointer operand is done to help - // matching the constant offsets in the addressing modes. - setTargetDAGCombine(ISD::LOAD); - setTargetDAGCombine(ISD::STORE); - setTargetDAGCombine(ISD::ATOMIC_LOAD); - setTargetDAGCombine(ISD::ATOMIC_STORE); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); - setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); - setTargetDAGCombine(ISD::ATOMIC_SWAP); - setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); - setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); - setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); - setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); - setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); - - setSchedulingPreference(Sched::RegPressure); -} - -//===----------------------------------------------------------------------===// -// TargetLowering queries -//===----------------------------------------------------------------------===// - -bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl &, - EVT) const { - // SI has some legal vector types, but no legal vector operations. Say no - // shuffles are legal in order to prefer scalarizing some vector operations. 
- return false; -} - -bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty, unsigned AS) const { - // No global is ever allowed as a base. - if (AM.BaseGV) - return false; - - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? - - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. - return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. - return true; - default: // Don't allow n * r - return false; - } - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // Basic, single offset DS instructions allow a 16-bit unsigned immediate - // field. - // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have - // an 8-bit dword offset but we don't know the alignment here. - if (!isUInt<16>(AM.BaseOffs)) - return false; - - if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. - return true; - - if (AM.Scale == 1 && AM.HasBaseReg) - return true; - - return false; - } - case AMDGPUAS::FLAT_ADDRESS: { - // Flat instructions do not have offsets, and only have the register - // address. - return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); - } - default: - llvm_unreachable("unhandled address space"); - } -} - -bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned AddrSpace, - unsigned Align, - bool *IsFast) const { - if (IsFast) - *IsFast = false; - - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - if (!VT.isSimple() || VT == MVT::Other) - return false; - - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. - - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. - return Align % 4 == 0; - } - - // Smaller than dword value must be aligned. 
- // FIXME: This should be allowed on CI+ - if (VT.bitsLT(MVT::i32)) - return false; - - // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the - // byte-address are ignored, thus forcing Dword alignment. - // This applies to private, global, and constant memory. - if (IsFast) - *IsFast = true; - - return VT.bitsGT(MVT::i32) && Align % 4 == 0; -} - -EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - // FIXME: Should account for address space here. - - // The default fallback uses the private pointer size as a guess for a type to - // use. Make sure we switch these to 64-bit accesses. - - if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global - return MVT::v4i32; - - if (Size >= 8 && DstAlign >= 4) - return MVT::v2i32; - - // Use the default. - return MVT::Other; -} - -TargetLoweringBase::LegalizeTypeAction -SITargetLowering::getPreferredVectorAction(EVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - - return TargetLoweringBase::getPreferredVectorAction(VT); -} - -bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - return TII->isInlineConstant(Imm); -} - -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { - const DataLayout *DL = getDataLayout(); - MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); - SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); - SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - - unsigned Align = DL->getABITypeAlignment(Ty); - - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. - assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load); - } - - ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment -} - -SDValue SITargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - - MachineFunction &MF = DAG.getMachineFunction(); - FunctionType *FType = MF.getFunction()->getFunctionType(); - SIMachineFunctionInfo *Info = MF.getInfo(); - - assert(CallConv == CallingConv::C); - - SmallVector Splits; - BitVector Skipped(Ins.size()); - - for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { - const ISD::InputArg &Arg = Ins[i]; - - // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { - - assert((PSInputNum <= 15) && "Too many PS inputs!"); - - if (!Arg.Used) { - // We can savely skip PS inputs - Skipped.set(i); - ++PSInputNum; - continue; - } - - Info->PSInputAddr |= 1 << PSInputNum++; - } - - // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eigth. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); - } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); - } - } - - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); - - // At least one interpolation mode must be enabled or else the GPU will hang. - if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - } - - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. 
- else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - } - - if (Info->getShaderType() == ShaderType::COMPUTE) { - getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, - Splits); - } - - AnalyzeFormalArguments(CCInfo, Splits); - - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { - - const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { - InVals.push_back(DAG.getUNDEF(Arg.VT)); - continue; - } - - CCValAssign &VA = ArgLocs[ArgIdx++]; - MVT VT = VA.getLocVT(); - - if (VA.isMemLoc()) { - VT = Ins[i].VT; - EVT MemVT = Splits[i].VT; - const unsigned Offset = 36 + VA.getLocMemOffset(); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. - SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), - Offset, Ins[i].Flags.isSExt()); - - const PointerType *ParamTy = - dyn_cast(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && - ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - // On SI local pointers are just offsets into LDS, so they are always - // less than 16-bits. On CI and newer they could potentially be - // real pointers, so we can't guarantee their size. 
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); - } - - InVals.push_back(Arg); - Info->ABIArgOffset = Offset + MemVT.getStoreSize(); - continue; - } - assert(VA.isRegLoc() && "Parameter must be in a register!"); - - unsigned Reg = VA.getLocReg(); - - if (VT == MVT::i64) { - // For now assume it is a pointer - Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, - &AMDGPU::SReg_64RegClass); - Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); - InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); - continue; - } - - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - - Reg = MF.addLiveIn(Reg, RC); - SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); - - if (Arg.VT.isVector()) { - - // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - SmallVector Regs; - Regs.push_back(Val); - for (unsigned j = 1; j != NumElements; ++j) { - Reg = ArgLocs[ArgIdx++].getLocReg(); - Reg = MF.addLiveIn(Reg, RC); - Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); - } - - // Fill up the missing vector elements - NumElements = Arg.VT.getVectorNumElements() - NumElements; - Regs.append(NumElements, DAG.getUNDEF(VT)); - - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); - continue; - } - - InVals.push_back(Val); - } - - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); - } - return Chain; -} - -MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - switch (MI->getOpcode()) { - default: - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); - case AMDGPU::BRANCH: - return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } - } - return BB; -} - -bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { - // This currently forces unfolding various combinations of fsub into fma with - // free fneg'd operands. As long as we have fast FMA (controlled by - // isFMAFasterThanFMulAndFAdd), we should perform these. - - // When fma is quarter rate, for f64 where add / sub are at best half rate, - // most of these combines appear to be cycle neutral but save on instruction - // count / code size. - return true; -} - -EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const { - if (!VT.isVector()) { - return MVT::i1; - } - return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements()); -} - -MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { - return MVT::i32; -} - -// Answering this is somewhat tricky and depends on the specific device which -// have different rates for fma or all f64 operations. 
-// -// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other -// regardless of which device (although the number of cycles differs between -// devices), so it is always profitable for f64. -// -// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable -// only on full rate devices. Normally, we should prefer selecting v_mad_f32 -// which we can always do even without fused FP ops since it returns the same -// result as the separate operations and since it is always full -// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32 -// however does not support denormals, so we do report fma as faster if we have -// a fast fma device and require denormals. -// -bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; - - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - // This is as fast on some subtargets. However, we always have full rate f32 - // mad available which returns the same result as the separate operations - // which we should prefer over fma. We can't use this if we want to support - // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); - case MVT::f64: - return true; - default: - break; - } - - return false; -} - -//===----------------------------------------------------------------------===// -// Custom DAG Lowering Operations -//===----------------------------------------------------------------------===// - -SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); - case ISD::BRCOND: return LowerBRCOND(Op, DAG); - case ISD::LOAD: { - SDValue Result = LowerLOAD(Op, DAG); - assert((!Result.getNode() || - Result.getNode()->getNumValues() == 2) && - "Load should return a value and a chain"); - return Result; - } - - case ISD::FSIN: - case ISD::FCOS: - return LowerTrig(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); - case ISD::FDIV: return LowerFDIV(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::GlobalAddress: { - MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *MFI = MF.getInfo(); - return LowerGlobalAddress(MFI, Op, DAG); - } - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); - case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); - } - return SDValue(); -} - -/// \brief Helper function for LowerBRCOND -static SDNode *findUser(SDValue Value, unsigned Opcode) { - - SDNode *Parent = Value.getNode(); - for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); - I != E; ++I) { - - if (I.getUse().get() != Value) - continue; - - if (I->getOpcode() == Opcode) - return *I; - } - return nullptr; -} - -SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { - - FrameIndexSDNode *FINode = cast(Op); - unsigned FrameIndex = FINode->getIndex(); - - return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); -} - -/// This transforms the control flow intrinsics to get the branch destination as -/// last parameter, also switches branch target with BR if the need arise -SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, - SelectionDAG &DAG) const { - - SDLoc DL(BRCOND); - - SDNode *Intr = BRCOND.getOperand(1).getNode(); - SDValue Target = BRCOND.getOperand(2); - SDNode *BR = nullptr; - - if 
(Intr->getOpcode() == ISD::SETCC) { - // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); - Intr = SetCC->getOperand(0).getNode(); - - } else { - // Get the target from BR if we don't negate the condition - BR = findUser(BRCOND, ISD::BR); - Target = BR->getOperand(1); - } - - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); - - // Build the result and - ArrayRef Res(Intr->value_begin() + 1, Intr->value_end()); - - // operands of the new intrinsic call - SmallVector Ops; - Ops.push_back(BRCOND.getOperand(0)); - Ops.append(Intr->op_begin() + 1, Intr->op_end()); - Ops.push_back(Target); - - // build the new intrinsic call - SDNode *Result = DAG.getNode( - Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, - DAG.getVTList(Res), Ops).getNode(); - - if (BR) { - // Give the branch instruction our target - SDValue Ops[] = { - BR->getOperand(0), - BRCOND.getOperand(2) - }; - SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); - DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); - BR = NewBR.getNode(); - } - - SDValue Chain = SDValue(Result, Result->getNumValues() - 1); - - // Copy the intrinsic results to registers - for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { - SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); - if (!CopyToReg) - continue; - - Chain = DAG.getCopyToReg( - Chain, DL, - CopyToReg->getOperand(1), - SDValue(Result, i - 1), - SDValue()); - - DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); - } - - // Remove the old intrinsic from the chain - DAG.ReplaceAllUsesOfValueWith( - SDValue(Intr, Intr->getNumValues() - 1), - Intr->getOperand(0)); - - return Chain; -} - -SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, - SDValue Op, - SelectionDAG &DAG) const { - GlobalAddressSDNode *GSD = cast(Op); - - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) - return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); - - SDLoc DL(GSD); - const GlobalValue *GV = GSD->getGlobal(); - MVT PtrVT = getPointerTy(GSD->getAddressSpace()); - - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); -} - -SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, - SDValue V) const { - // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, - // so we will end up with redundant moves to m0. - // - // We can't use S_MOV_B32, because there is no way to specify m0 as the - // destination register. - // - // We have to use them both. Machine cse will combine all the S_MOV_B32 - // instructions and the register coalescer eliminate the extra copies. 
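(Illustrative only, assuming the comment above holds; register names hypothetical. The node built below selects to a two-step sequence that later passes are expected to collapse to one move per distinct value:)

    %tmp = S_MOV_B32 <value>   ; MachineCSE merges duplicate S_MOV_B32s
    $m0  = COPY %tmp           ; register coalescer folds this, leaving s_mov_b32 m0, <value>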
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); - return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), - SDValue(M0, 0), SDValue()); // Glue - // A Null SDValue creates - // a glue result. -} - -SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast(Subtarget->getRegisterInfo()); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned IntrinsicID = cast(Op.getOperand(0))->getZExtValue(); - - switch (IntrinsicID) { - case Intrinsic::r600_read_ngroups_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); - case Intrinsic::r600_read_ngroups_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); - case Intrinsic::r600_read_ngroups_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); - case Intrinsic::r600_read_global_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); - case Intrinsic::r600_read_global_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); - case Intrinsic::r600_read_global_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); - case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); - case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); - case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - - case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - MF.getInfo()->ABIArgOffset, - false); - - case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); - case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); - case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); - case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); - case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); - case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops[] = { - Op.getOperand(1), - Op.getOperand(2) - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, VT, MMO); - } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return 
LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); - case AMDGPUIntrinsic::SI_vs_load_input: - return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); - case AMDGPUIntrinsic::SI_fs_constant: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(1), Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_fs_interp: { - SDValue IJ = Op.getOperand(4); - SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(0, DL, MVT::i32)); - SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(1, DL, MVT::i32)); - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); - SDValue Glue = M0.getValue(1); - SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, - DAG.getVTList(MVT::f32, MVT::Glue), - I, Op.getOperand(1), Op.getOperand(2), Glue); - Glue = SDValue(P1.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, - Op.getOperand(1), Op.getOperand(2), Glue); - } - default: - return AMDGPUTargetLowering::LowerOperation(Op, DAG); - } -} - -SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); - - switch (IntrinsicID) { - case AMDGPUIntrinsic::SI_sendmsg: { - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } - default: - return SDValue(); - } -} - -SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - LoadSDNode *Load = cast(Op); - - if (Op.getValueType().isVector()) { - assert(Op.getValueType().getVectorElementType() == MVT::i32 && - "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = Op.getValueType().getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); - switch (Load->getAddressSpace()) { - default: break; - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - // v4 loads are supported for private and global memory. 
- if (NumElements <= 4) - break; - // fall-through - case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); - } - } - - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} - -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); -} - -SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); - - SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); - - SDValue Zero = DAG.getConstant(0, DL, MVT::i32); - SDValue One = DAG.getConstant(1, DL, MVT::i32); - - SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); - SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); - - SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); - SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); - - SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); - - SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); - SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); - - SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); -} - -// Catch division cases where we can use shortcuts with rcp and rsq -// instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { - SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - EVT VT = Op.getValueType(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; - - if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { - if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && - CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. - - // 1.0 / sqrt(x) -> rsq(x) - // - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - } - } - - if (Unsafe) { - // Turn into multiply by the reciprocal. - // x / y -> x * (1.0 / y) - SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); - } - - return SDValue(); -} - -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) - return FastLowered; - - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. 
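(A hedged scalar sketch of the lowering below, not part of the original file: the bit patterns used as constants are 0x6f800000 ≈ 2^96 and 0x2f800000 = 2^-32, so in pseudo-C the sequence computes roughly the following, where rcp() stands in for v_rcp_f32:)

    float fdiv32(float x, float y) {
      // Pre-scale huge denominators so rcp(y) does not hit the denormal range,
      // then undo the scaling on the quotient.
      float s = (fabsf(y) > 0x1.0p96f) ? 0x1.0p-32f : 1.0f;
      return s * (x * rcp(y * s));
    }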
- if (Subtarget->hasFP32Denormals()) - return SDValue(); - - SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - - EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); - - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); -} - -SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); - - SDLoc SL(Op); - SDValue X = Op.getOperand(0); - SDValue Y = Op.getOperand(1); - - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64); - - SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1); - - SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X); - - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0); - - SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0); - - SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One); - - SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp); - - SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One); - - SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X); - - SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3); - - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64, - NegDivScale0, Mul, DivScale1); - - SDValue Scale; - - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { - // Workaround a hardware bug on SI where the condition output from div_scale - // is not usable. - - const SDValue Hi = DAG.getConstant(1, SL, MVT::i32); - - // Figure out if the scale to use for div_fmas. 
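    // The code below reconstructs that flag by hand: bitcast the f64 operands
    // and both div_scale results to v2i32, compare the high dwords of the
    // operands against the two results, and XOR the comparisons to form the
    // scale bit passed to DIV_FMAS.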
- SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); - SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y); - SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0); - SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1); - - SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi); - SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi); - - SDValue Scale0Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi); - SDValue Scale1Hi - = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi); - - SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ); - SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ); - Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen); - } else { - Scale = DivScale1.getValue(1); - } - - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, - Fma4, Fma3, Mul, Scale); - - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X); -} - -SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - - if (VT == MVT::f32) - return LowerFDIV32(Op, DAG); - - if (VT == MVT::f64) - return LowerFDIV64(Op, DAG); - - llvm_unreachable("Unexpected type for fdiv"); -} - -SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - StoreSDNode *Store = cast(Op); - EVT VT = Store->getMemoryVT(); - - // These stores are legal. - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); - } - - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; - - if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); - - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); - - return SDValue(); -} - -SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Arg = Op.getOperand(0); - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.5/M_PI, DL, - VT))); - - switch (Op.getOpcode()) { - case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); - case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); - default: - llvm_unreachable("Wrong trig opcode"); - } -} - -//===----------------------------------------------------------------------===// -// Custom DAG optimizations -//===----------------------------------------------------------------------===// - -SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - EVT VT = N->getValueType(0); - EVT ScalarVT = VT.getScalarType(); - if (ScalarVT != MVT::f32) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - SDValue Src = N->getOperand(0); - EVT SrcVT = Src.getValueType(); - - // TODO: We could try to match extracting the higher bytes, which would be - // easier if i8 vectors weren't promoted to i32 vectors, particularly after - // types are legalized. v4i8 -> v4f32 is probably the only case to worry - // about in practice. 
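  // The main case handled further down: a v4i8 extending load feeding
  // uint_to_fp becomes a single zero-extending i32 load plus one
  // CVT_F32_UBYTE0..3 per byte lane, instead of being expanded to v4i32
  // and repacked.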
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { - if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); - DCI.AddToWorklist(Cvt.getNode()); - return Cvt; - } - } - - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. - // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - - return SDValue(); -} - -/// \brief Return true if the given offset Size in bytes can be folded into -/// the immediate offsets of a memory instruction for the given address space. -static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { - switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: { - // MUBUF instructions a 12-bit offset in bytes. - return isUInt<12>(OffsetSize); - } - case AMDGPUAS::CONSTANT_ADDRESS: { - // SMRD instructions have an 8-bit offset in dwords on SI and - // a 20-bit offset in bytes on VI. 
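    // Concretely: on SI the largest foldable offset is 255 dwords (1020 bytes)
    // and it must be dword-aligned, so 1020 folds while 6 and 1024 do not;
    // on VI any byte offset below 2^20 can be folded.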
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return isUInt<20>(OffsetSize); - else - return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); - } - case AMDGPUAS::LOCAL_ADDRESS: - case AMDGPUAS::REGION_ADDRESS: { - // The single offset versions have a 16-bit offset in bytes. - return isUInt<16>(OffsetSize); - } - case AMDGPUAS::PRIVATE_ADDRESS: - // Indirect register addressing does not use any offsets. - default: - return 0; - } -} - -// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) - -// This is a variant of -// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), -// -// The normal DAG combiner will do this, but only if the add has one use since -// that would increase the number of instructions. -// -// This prevents us from seeing a constant offset that can be folded into a -// memory instruction's addressing mode. If we know the resulting add offset of -// a pointer can be folded into an addressing offset, we can replace the pointer -// operand with the add of new constant offset. This eliminates one of the uses, -// and may allow the remaining use to also be simplified. -// -SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, - unsigned AddrSpace, - DAGCombinerInfo &DCI) const { - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - if (N0.getOpcode() != ISD::ADD) - return SDValue(); - - const ConstantSDNode *CN1 = dyn_cast(N1); - if (!CN1) - return SDValue(); - - const ConstantSDNode *CAdd = dyn_cast(N0.getOperand(1)); - if (!CAdd) - return SDValue(); - - // If the resulting offset is too large, we can't fold it into the addressing - // mode offset. - APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - EVT VT = N->getValueType(0); - - SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); - SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); - - return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); -} - -SDValue SITargetLowering::performAndCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - if (DCI.isBeforeLegalize()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - - // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> - // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity) - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - if (LHS.getOpcode() == ISD::SETCC && - RHS.getOpcode() == ISD::SETCC) { - ISD::CondCode LCC = cast(LHS.getOperand(2))->get(); - ISD::CondCode RCC = cast(RHS.getOperand(2))->get(); - - SDValue X = LHS.getOperand(0); - SDValue Y = RHS.getOperand(0); - if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X) - return SDValue(); - - if (LCC == ISD::SETO) { - if (X != LHS.getOperand(1)) - return SDValue(); - - if (RCC == ISD::SETUNE) { - const ConstantFPSDNode *C1 = dyn_cast(RHS.getOperand(1)); - if (!C1 || !C1->isInfinity() || C1->isNegative()) - return SDValue(); - - const uint32_t Mask = SIInstrFlags::N_NORMAL | - SIInstrFlags::N_SUBNORMAL | - SIInstrFlags::N_ZERO | - SIInstrFlags::P_ZERO | - SIInstrFlags::P_SUBNORMAL | - SIInstrFlags::P_NORMAL; - - static_assert(((~(SIInstrFlags::S_NAN | - SIInstrFlags::Q_NAN | - SIInstrFlags::N_INFINITY | - SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask, - "mask not equal"); - - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - X, DAG.getConstant(Mask, DL, MVT::i32)); - } - } - } - - return SDValue(); -} - -SDValue SITargetLowering::performOrCombine(SDNode *N, - 
DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) - if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && - RHS.getOpcode() == AMDGPUISD::FP_CLASS) { - SDValue Src = LHS.getOperand(0); - if (Src != RHS.getOperand(0)) - return SDValue(); - - const ConstantSDNode *CLHS = dyn_cast(LHS.getOperand(1)); - const ConstantSDNode *CRHS = dyn_cast(RHS.getOperand(1)); - if (!CLHS || !CRHS) - return SDValue(); - - // Only 10 bits are used. - static const uint32_t MaxMask = 0x3ff; - - uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, - Src, DAG.getConstant(NewMask, DL, MVT::i32)); - } - - return SDValue(); -} - -SDValue SITargetLowering::performClassCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue Mask = N->getOperand(1); - - // fp_class x, 0 -> false - if (const ConstantSDNode *CMask = dyn_cast(Mask)) { - if (CMask->isNullValue()) - return DAG.getConstant(0, SDLoc(N), MVT::i1); - } - - return SDValue(); -} - -static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { - switch (Opc) { - case ISD::FMAXNUM: - return AMDGPUISD::FMAX3; - case ISD::SMAX: - return AMDGPUISD::SMAX3; - case ISD::UMAX: - return AMDGPUISD::UMAX3; - case ISD::FMINNUM: - return AMDGPUISD::FMIN3; - case ISD::SMIN: - return AMDGPUISD::SMIN3; - case ISD::UMIN: - return AMDGPUISD::UMIN3; - default: - llvm_unreachable("Not a min/max opcode"); - } -} - -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - - unsigned Opc = N->getOpcode(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - // Only do this if the inner op has one use since this will just increases - // register pressure for no benefit. 
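  // (If the inner min/max has other users it stays live anyway, so the fold
  // would add a min3/max3 next to it rather than replacing it.)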
- - // max(max(a, b), c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); - } - - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); - } - - return SDValue(); -} - -SDValue SITargetLowering::performSetCCCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc SL(N); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - EVT VT = LHS.getValueType(); - - if (VT != MVT::f32 && VT != MVT::f64) - return SDValue(); - - // Match isinf pattern - // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - ISD::CondCode CC = cast(N->getOperand(2))->get(); - if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { - const ConstantFPSDNode *CRHS = dyn_cast(RHS); - if (!CRHS) - return SDValue(); - - const APFloat &APF = CRHS->getValueAPF(); - if (APF.isInfinity() && !APF.isNegative()) { - unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; - return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(Mask, SL, MVT::i32)); - } - } - - return SDValue(); -} - -SDValue SITargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - switch (N->getOpcode()) { - default: - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - case ISD::SETCC: - return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? - case ISD::FMINNUM: - case ISD::SMAX: - case ISD::SMIN: - case ISD::UMAX: - case ISD::UMIN: { - if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - N->getValueType(0) != MVT::f64 && - getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); - break; - } - - case AMDGPUISD::CVT_F32_UBYTE0: - case AMDGPUISD::CVT_F32_UBYTE1: - case AMDGPUISD::CVT_F32_UBYTE2: - case AMDGPUISD::CVT_F32_UBYTE3: { - unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - - SDValue Src = N->getOperand(0); - APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLO.ShrinkDemandedConstant(Src, Demanded) || - TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); - } - - break; - } - - case ISD::UINT_TO_FP: { - return performUCharToFloatCombine(N, DCI); - - case ISD::FADD: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - if (VT != MVT::f32) - break; - - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (Subtarget->hasFP32Denormals()) - break; - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - - // These should really be instruction patterns, but writing patterns with - // source modiifiers is a pain. 
- - // fadd (fadd (a, a), b) -> mad 2.0, a, b - if (LHS.getOpcode() == ISD::FADD) { - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS); - } - } - - // fadd (b, fadd (a, a)) -> mad 2.0, a, b - if (RHS.getOpcode() == ISD::FADD) { - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS); - } - } - - return SDValue(); - } - case ISD::FSUB: { - if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) - break; - - EVT VT = N->getValueType(0); - - // Try to get the fneg to fold into the source modifier. This undoes generic - // DAG combines and folds them into the mad. - // - // Only do this if we are not trying to support denormals. v_mad_f32 does - // not support denormals ever. - if (VT == MVT::f32 && - !Subtarget->hasFP32Denormals()) { - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS.getOpcode() == ISD::FADD) { - // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) - - SDValue A = LHS.getOperand(0); - if (A == LHS.getOperand(1)) { - const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32); - SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS); - - return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS); - } - } - - if (RHS.getOpcode() == ISD::FADD) { - // (fsub c, (fadd a, a)) -> mad -2.0, a, c - - SDValue A = RHS.getOperand(0); - if (A == RHS.getOperand(1)) { - const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32); - return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS); - } - } - - return SDValue(); - } - - break; - } - } - case ISD::LOAD: - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: - case ISD::ATOMIC_CMP_SWAP: - case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. - if (DCI.isBeforeLegalize()) - break; - - MemSDNode *MemNode = cast(N); - SDValue Ptr = MemNode->getBasePtr(); - - // TODO: We could also do this for multiplies. - unsigned AS = MemNode->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { - SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); - if (NewPtr) { - SmallVector NewOps(MemNode->op_begin(), MemNode->op_end()); - - NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; - return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); - } - } - break; - } - case ISD::AND: - return performAndCombine(N, DCI); - case ISD::OR: - return performOrCombine(N, DCI); - case AMDGPUISD::FP_CLASS: - return performClassCombine(N, DCI); - } - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); -} - -/// \brief Analyze the possible immediate value Op -/// -/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate -/// and the immediate value if it's a literal immediate -int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - if (const ConstantSDNode *Node = dyn_cast(N)) { - if (TII->isInlineConstant(Node->getAPIntValue())) - return 0; - - uint64_t Val = Node->getZExtValue(); - return isUInt<32>(Val) ? 
Val : -1; - } - - if (const ConstantFPSDNode *Node = dyn_cast(N)) { - if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt())) - return 0; - - if (Node->getValueType(0) == MVT::f32) - return FloatToBits(Node->getValueAPF().convertToFloat()); - - return -1; - } - - return -1; -} - -/// \brief Helper function for adjustWritemask -static unsigned SubIdx2Lane(unsigned Idx) { - switch (Idx) { - default: return 0; - case AMDGPU::sub0: return 0; - case AMDGPU::sub1: return 1; - case AMDGPU::sub2: return 2; - case AMDGPU::sub3: return 3; - } -} - -/// \brief Adjust the writemask of MIMG instructions -void SITargetLowering::adjustWritemask(MachineSDNode *&Node, - SelectionDAG &DAG) const { - SDNode *Users[4] = { }; - unsigned Lane = 0; - unsigned OldDmask = Node->getConstantOperandVal(0); - unsigned NewDmask = 0; - - // Try to figure out the used register components - for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); - I != E; ++I) { - - // Abort if we can't understand the usage - if (!I->isMachineOpcode() || - I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) - return; - - // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. - // Note that subregs are packed, i.e. Lane==0 is the first bit set - // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit - // set, etc. - Lane = SubIdx2Lane(I->getConstantOperandVal(1)); - - // Set which texture component corresponds to the lane. - unsigned Comp; - for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { - assert(Dmask); - Comp = countTrailingZeros(Dmask); - Dmask &= ~(1 << Comp); - } - - // Abort if we have more than one user per component - if (Users[Lane]) - return; - - Users[Lane] = *I; - NewDmask |= 1 << Comp; - } - - // Abort if there's no change - if (NewDmask == OldDmask) - return; - - // Adjust the writemask in the node - std::vector Ops; - Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); - Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); - - // If we only got one lane, replace it with a copy - // (if NewDmask has only one bit set...) - if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), - MVT::i32); - SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - SDLoc(), Users[Lane]->getValueType(0), - SDValue(Node, 0), RC); - DAG.ReplaceAllUsesWith(Users[Lane], Copy); - return; - } - - // Update the users of the node with the new indices - for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { - - SDNode *User = Users[i]; - if (!User) - continue; - - SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, User->getOperand(0), Op); - - switch (Idx) { - default: break; - case AMDGPU::sub0: Idx = AMDGPU::sub1; break; - case AMDGPU::sub1: Idx = AMDGPU::sub2; break; - case AMDGPU::sub2: Idx = AMDGPU::sub3; break; - } - } -} - -/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) -/// with frame index operands. -/// LLVM assumes that inputs are to these instructions are registers. 
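/// Frame-index operands are therefore rewritten here into the results of
/// S_MOV_B32 nodes, so the target-independent instruction only ever sees
/// register operands.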
-void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, - SelectionDAG &DAG) const { - - SmallVector Ops; - for (unsigned i = 0; i < Node->getNumOperands(); ++i) { - if (!isa(Node->getOperand(i))) { - Ops.push_back(Node->getOperand(i)); - continue; - } - - SDLoc DL(Node); - Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, - Node->getOperand(i).getValueType(), - Node->getOperand(i)), 0)); - } - - DAG.UpdateNodeOperands(Node, Ops); -} - -/// \brief Fold the instructions after selecting them. -SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - if (TII->isMIMG(Node->getMachineOpcode())) - adjustWritemask(Node, DAG); - - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { - legalizeTargetIndependentNode(Node, DAG); - return Node; - } - return Node; -} - -/// \brief Assign the register class depending on the number of -/// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - - if (TII->isMIMG(MI->getOpcode())) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); - unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; - - const TargetRegisterClass *RC; - switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VGPR_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; - } - - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); - MRI.setRegClass(VReg, RC); - return; - } - - // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); - if (NoRetAtomicOp != -1) { - if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); - } - - return; - } -} - -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { - SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); - return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); -} - -MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. 
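  // The REG_SEQUENCE below assembles the final SReg_128 resource descriptor:
  // the 64-bit pointer occupies sub0_sub1 and the constant half built above
  // (zero plus the default resource data format) occupies sub2_sub3.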
- const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; - - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; - - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); - -#endif -} - -/// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. -MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, - uint64_t RsrcDword2And3) const { - SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); - SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); - if (RsrcDword1) { - PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, - DAG.getConstant(RsrcDword1, DL, MVT::i32)), - 0); - } - - SDValue DataLo = buildSMovImm32(DAG, DL, - RsrcDword2And3 & UINT64_C(0xFFFFFFFF)); - SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32); - - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - PtrLo, - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - PtrHi, - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - DataLo, - DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32), - DataHi, - DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32) - }; - - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); -} - -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast(Subtarget->getInstrInfo()); - uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; // Size - - return buildRSRC(DAG, DL, Ptr, 0, Rsrc); -} - -SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, - const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const { - SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); - - return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), - cast(VReg)->getReg(), VT); -} - -//===----------------------------------------------------------------------===// -// SI Inline Assembly Support -//===----------------------------------------------------------------------===// - -std::pair -SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, - const std::string &Constraint, - MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: - return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); - } - } - - if (Constraint.size() > 1) { - const TargetRegisterClass *RC = nullptr; - if (Constraint[1] == 'v') { - RC = &AMDGPU::VGPR_32RegClass; - } else if (Constraint[1] == 's') { - RC = &AMDGPU::SGPR_32RegClass; - } - - if (RC) { - unsigned Idx = 
std::atoi(Constraint.substr(2).c_str()); - if (Idx < RC->getNumRegs()) - return std::make_pair(RC->getRegister(Idx), RC); - } - } - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); -} diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h deleted file mode 100644 index a956b013bdb..00000000000 --- a/lib/Target/R600/SIISelLowering.h +++ /dev/null @@ -1,125 +0,0 @@ -//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI DAG Lowering interface definition -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H -#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H - -#include "AMDGPUISelLowering.h" -#include "SIInstrInfo.h" - -namespace llvm { - -class SITargetLowering : public AMDGPUTargetLowering { - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, - SDValue Chain, unsigned Offset, bool Signed) const; - SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, - SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, - SelectionDAG &DAG) const override; - - SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - - void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; - - SDValue performUCharToFloatCombine(SDNode *N, - DAGCombinerInfo &DCI) const; - SDValue performSHLPtrCombine(SDNode *N, - unsigned AS, - DAGCombinerInfo &DCI) const; - SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; - - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; - -public: - SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); - - bool isShuffleMaskLegal(const SmallVectorImpl &/*Mask*/, - EVT /*VT*/) const override; - - bool isLegalAddressingMode(const AddrMode &AM, - Type *Ty, unsigned AS) const override; - - bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, - unsigned Align, - bool *IsFast) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const override; - - TargetLoweringBase::LegalizeTypeAction - getPreferredVectorAction(EVT VT) const override; - - bool 
shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const override; - bool enableAggressiveFMAFusion(EVT VT) const override; - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - MVT getScalarShiftAmountTy(EVT VT) const override; - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; - void AdjustInstrPostInstrSelection(MachineInstr *MI, - SDNode *Node) const override; - - int32_t analyzeImmediate(const SDNode *N) const; - SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, - unsigned Reg, EVT VT) const override; - void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; - - MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; - MachineSDNode *buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, - uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - - std::pair getRegForInlineAsmConstraint( - const TargetRegisterInfo *TRI, - const std::string &Constraint, MVT VT) const override; - SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp deleted file mode 100644 index 90a37f17468..00000000000 --- a/lib/Target/R600/SIInsertWaits.cpp +++ /dev/null @@ -1,480 +0,0 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Insert wait instructions for memory reads and writes. -/// -/// Memory reads and writes are issued asynchronously, so we need to insert -/// S_WAITCNT instructions when we want to access any of their results or -/// overwrite any register that's used asynchronously. 
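/// For example, the result of an SMRD load must not be consumed until
/// LGKM_CNT has dropped far enough to cover that load; the pass tracks, per
/// register, the counter values at the last asynchronous def and use and
/// emits an S_WAITCNT sized to cover them.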
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -/// \brief One variable for each of the hardware counters -typedef union { - struct { - unsigned VM; - unsigned EXP; - unsigned LGKM; - } Named; - unsigned Array[3]; - -} Counters; - -typedef enum { - OTHER, - SMEM, - VMEM -} InstType; - -typedef Counters RegCounters[512]; -typedef std::pair RegInterval; - -class SIInsertWaits : public MachineFunctionPass { - -private: - static char ID; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - - /// \brief Constant hardware limits - static const Counters WaitCounts; - - /// \brief Constant zero value - static const Counters ZeroCounts; - - /// \brief Counter values we have already waited on. - Counters WaitedOn; - - /// \brief Counter values for last instruction issued. - Counters LastIssued; - - /// \brief Registers used by async instructions. - RegCounters UsedRegs; - - /// \brief Registers defined by async instructions. - RegCounters DefinedRegs; - - /// \brief Different export instruction types seen since last wait. - unsigned ExpInstrTypesSeen; - - /// \brief Type of the last opcode. - InstType LastOpcodeType; - - bool LastInstWritesM0; - - /// \brief Get increment/decrement amount for this instruction. - Counters getHwCounts(MachineInstr &MI); - - /// \brief Is operand relevant for async execution? - bool isOpRelevant(MachineOperand &Op); - - /// \brief Get register interval an operand affects. - RegInterval getRegInterval(MachineOperand &Op); - - /// \brief Handle instructions async components - void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); - - /// \brief Insert the actual wait instruction - bool insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Counts); - - /// \brief Do we need def2def checks? - bool unorderedDefines(MachineInstr &MI); - - /// \brief Resolve all operand dependencies to counter requirements - Counters handleOperands(MachineInstr &MI); - - /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. 
- void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - -public: - SIInsertWaits(TargetMachine &tm) : - MachineFunctionPass(ID), - TII(nullptr), - TRI(nullptr), - ExpInstrTypesSeen(0) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI insert wait instructions"; - } - -}; - -} // End anonymous namespace - -char SIInsertWaits::ID = 0; - -const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; -const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; - -FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { - return new SIInsertWaits(tm); -} - -Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; - - Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); - - // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && - (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore())); - - // LGKM may uses larger values - if (TSFlags & SIInstrFlags::LGKM_CNT) { - - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - - } else { - // DS - Result.Named.LGKM = 1; - } - - } else { - Result.Named.LGKM = 0; - } - - return Result; -} - -bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - - // Constants are always irrelevant - if (!Op.isReg()) - return false; - - // Defines are always relevant - if (Op.isDef()) - return true; - - // For exports all registers are relevant - MachineInstr &MI = *Op.getParent(); - if (MI.getOpcode() == AMDGPU::EXP) - return true; - - // For stores the stored value is also relevant - if (!MI.getDesc().mayStore()) - return false; - - // Check if this operand is the value being stored. - // Special case for DS instructions, since the address - // operand comes before the value operand and it may have - // multiple data operands. - - if (TII->isDS(MI.getOpcode())) { - MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); - if (Data && Op.isIdenticalTo(*Data)) - return true; - - MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - if (Data0 && Op.isIdenticalTo(*Data0)) - return true; - - MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - if (Data1 && Op.isIdenticalTo(*Data1)) - return true; - - return false; - } - - // NOTE: This assumes that the value operand is before the - // address operand, and that there is only one value operand. 
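  // Consequently, the scan below stops at the first register use operand and
  // treats it as the stored value.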
- for (MachineInstr::mop_iterator I = MI.operands_begin(), - E = MI.operands_end(); I != E; ++I) { - - if (I->isReg() && I->isUse()) - return Op.isIdenticalTo(*I); - } - - return false; -} - -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - - assert(Size >= 4); - - RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); - Result.second = Result.first + Size / 4; - - return Result; -} - -void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - - // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); - unsigned Sum = 0; - - for (unsigned i = 0; i < 3; ++i) { - LastIssued.Array[i] += Increment.Array[i]; - Sum += Increment.Array[i]; - } - - // If we don't increase anything then that's it - if (Sum == 0) { - LastOpcodeType = OTHER; - return; - } - - if (MBB.getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || - (LastOpcodeType == VMEM && Increment.Named.VM)) { - // Insert a NOP to break the clause. - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); - LastInstWritesM0 = false; - } - - if (TII->isSMRD(I->getOpcode())) - LastOpcodeType = SMEM; - else if (Increment.Named.VM) - LastOpcodeType = VMEM; - } - - // Remember which export instructions we have seen - if (Increment.Named.EXP) { - ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2; - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - - MachineOperand &Op = I->getOperand(i); - if (!isOpRelevant(Op)) - continue; - - RegInterval Interval = getRegInterval(Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - // Remember which registers we define - if (Op.isDef()) - DefinedRegs[j] = LastIssued; - - // and which one we are using - if (Op.isUse()) - UsedRegs[j] = LastIssued; - } - } -} - -bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Required) { - - // End of program? No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) - return false; - - // Figure out if the async instructions execute in order - bool Ordered[3]; - - // VM_CNT is always ordered - Ordered[0] = true; - - // EXP_CNT is unordered if we have both EXP & VM-writes - Ordered[1] = ExpInstrTypesSeen == 3; - - // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS - Ordered[2] = false; - - // The values we are going to put into the S_WAITCNT instruction - Counters Counts = WaitCounts; - - // Do we really need to wait? - bool NeedWait = false; - - for (unsigned i = 0; i < 3; ++i) { - - if (Required.Array[i] <= WaitedOn.Array[i]) - continue; - - NeedWait = true; - - if (Ordered[i]) { - unsigned Value = LastIssued.Array[i] - Required.Array[i]; - - // Adjust the value to the real hardware possibilities. 
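      // i.e. clamp to the largest value each counter field can encode
      // (15 for VM_CNT, 7 for EXP_CNT and LGKM_CNT, per WaitCounts above).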
- Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); - - } else - Counts.Array[i] = 0; - - // Remember on what we have waited on. - WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - } - - if (!NeedWait) - return false; - - // Reset EXP_CNT instruction types - if (Counts.Named.EXP == 0) - ExpInstrTypesSeen = 0; - - // Build the wait instruction - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm((Counts.Named.VM & 0xF) | - ((Counts.Named.EXP & 0x7) << 4) | - ((Counts.Named.LGKM & 0x7) << 8)); - - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - return true; -} - -/// \brief helper function for handleOperands -static void increaseCounters(Counters &Dst, const Counters &Src) { - - for (unsigned i = 0; i < 3; ++i) - Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); -} - -Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - - Counters Result = ZeroCounts; - - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - - // For each register affected by this - // instruction increase the result sequence - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - if (Op.isDef()) { - increaseCounters(Result, UsedRegs[j]); - increaseCounters(Result, DefinedRegs[j]); - } - - if (Op.isUse()) - increaseCounters(Result, DefinedRegs[j]); - } - } - - return Result; -} - -void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - if (MBB.getParent()->getSubtarget().getGeneration() < - AMDGPUSubtarget::VOLCANIC_ISLANDS) - return; - - // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. - if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - LastInstWritesM0 = false; - return; - } - - // Set whether this instruction sets M0 - LastInstWritesM0 = false; - - unsigned NumOperands = I->getNumOperands(); - for (unsigned i = 0; i < NumOperands; i++) { - const MachineOperand &Op = I->getOperand(i); - - if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) - LastInstWritesM0 = true; - } -} - -// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" -// around other non-memory instructions. -bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { - bool Changes = false; - - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - - MRI = &MF.getRegInfo(); - - WaitedOn = ZeroCounts; - LastIssued = ZeroCounts; - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - - memset(&UsedRegs, 0, sizeof(UsedRegs)); - memset(&DefinedRegs, 0, sizeof(DefinedRegs)); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - - // Wait for everything before a barrier. 
- if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); - else - Changes |= insertWait(MBB, I, handleOperands(*I)); - - pushInstruction(MBB, I); - handleSendMsg(MBB, I); - } - - // Wait for everything at the end of the MBB - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - } - - return Changes; -} diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td deleted file mode 100644 index 211666a9bdb..00000000000 --- a/lib/Target/R600/SIInstrFormats.td +++ /dev/null @@ -1,673 +0,0 @@ -//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// SI Instruction format definitions. -// -//===----------------------------------------------------------------------===// - -class InstSI pattern> : - AMDGPUInst, PredicateControl { - - field bits<1> VM_CNT = 0; - field bits<1> EXP_CNT = 0; - field bits<1> LGKM_CNT = 0; - - field bits<1> SALU = 0; - field bits<1> VALU = 0; - - field bits<1> SOP1 = 0; - field bits<1> SOP2 = 0; - field bits<1> SOPC = 0; - field bits<1> SOPK = 0; - field bits<1> SOPP = 0; - - field bits<1> VOP1 = 0; - field bits<1> VOP2 = 0; - field bits<1> VOP3 = 0; - field bits<1> VOPC = 0; - - field bits<1> MUBUF = 0; - field bits<1> MTBUF = 0; - field bits<1> SMRD = 0; - field bits<1> DS = 0; - field bits<1> MIMG = 0; - field bits<1> FLAT = 0; - field bits<1> WQM = 0; - field bits<1> VGPRSpill = 0; - - // These need to be kept in sync with the enum in SIInstrFlags. - let TSFlags{0} = VM_CNT; - let TSFlags{1} = EXP_CNT; - let TSFlags{2} = LGKM_CNT; - - let TSFlags{3} = SALU; - let TSFlags{4} = VALU; - - let TSFlags{5} = SOP1; - let TSFlags{6} = SOP2; - let TSFlags{7} = SOPC; - let TSFlags{8} = SOPK; - let TSFlags{9} = SOPP; - - let TSFlags{10} = VOP1; - let TSFlags{11} = VOP2; - let TSFlags{12} = VOP3; - let TSFlags{13} = VOPC; - - let TSFlags{14} = MUBUF; - let TSFlags{15} = MTBUF; - let TSFlags{16} = SMRD; - let TSFlags{17} = DS; - let TSFlags{18} = MIMG; - let TSFlags{19} = FLAT; - let TSFlags{20} = WQM; - let TSFlags{21} = VGPRSpill; - - // Most instructions require adjustments after selection to satisfy - // operand requirements. - let hasPostISelHook = 1; - let SchedRW = [Write32Bit]; -} - -class Enc32 { - field bits<32> Inst; - int Size = 4; -} - -class Enc64 { - field bits<64> Inst; - int Size = 8; -} - -class VOPDstOperand : RegisterOperand ; -def VOPDstVCC : VOPDstOperand ; - -let Uses = [EXEC] in { - -class VOPAnyCommon pattern> : - InstSI { - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let UseNamedOperandTable = 1; - let VALU = 1; -} - -class VOPCCommon pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { - - let DisableEncoding = "$dst"; - let VOPC = 1; - let Size = 4; -} - -class VOP1Common pattern> : - VOPAnyCommon { - - let VOP1 = 1; - let Size = 4; -} - -class VOP2Common pattern> : - VOPAnyCommon { - - let VOP2 = 1; - let Size = 4; -} - -class VOP3Common pattern> : - VOPAnyCommon { - - // Using complex patterns gives VOP3 patterns a very high complexity rating, - // but standalone patterns are almost always prefered, so we need to adjust the - // priority lower. The goal is to use a high number to reduce complexity to - // zero (or less than zero). 
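  // The large negative AddedComplexity below more than offsets that bonus, so
  // the standalone (non-VOP3) encodings win whenever both forms match.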
-  let AddedComplexity = -1000;
-
-  let VOP3 = 1;
-  let VALU = 1;
-
-  let AsmMatchConverter = "cvtVOP3";
-  let isCodeGenOnly = 0;
-
-  int Size = 8;
-}
-
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Scalar operations
-//===----------------------------------------------------------------------===//
-
-class SOP1e <bits<8> op> : Enc32 {
-  bits<7> sdst;
-  bits<8> ssrc0;
-
-  let Inst{7-0} = ssrc0;
-  let Inst{15-8} = op;
-  let Inst{22-16} = sdst;
-  let Inst{31-23} = 0x17d; //encoding;
-}
-
-class SOP2e <bits<7> op> : Enc32 {
-  bits<7> sdst;
-  bits<8> ssrc0;
-  bits<8> ssrc1;
-
-  let Inst{7-0} = ssrc0;
-  let Inst{15-8} = ssrc1;
-  let Inst{22-16} = sdst;
-  let Inst{29-23} = op;
-  let Inst{31-30} = 0x2; // encoding
-}
-
-class SOPCe <bits<7> op> : Enc32 {
-  bits<8> ssrc0;
-  bits<8> ssrc1;
-
-  let Inst{7-0} = ssrc0;
-  let Inst{15-8} = ssrc1;
-  let Inst{22-16} = op;
-  let Inst{31-23} = 0x17e;
-}
-
-class SOPKe <bits<5> op> : Enc32 {
-  bits <7> sdst;
-  bits <16> simm16;
-
-  let Inst{15-0} = simm16;
-  let Inst{22-16} = sdst;
-  let Inst{27-23} = op;
-  let Inst{31-28} = 0xb; //encoding
-}
-
-class SOPK64e <bits<5> op> : Enc64 {
-  bits <7> sdst = 0;
-  bits <16> simm16;
-  bits <32> imm;
-
-  let Inst{15-0} = simm16;
-  let Inst{22-16} = sdst;
-  let Inst{27-23} = op;
-  let Inst{31-28} = 0xb;
-
-  let Inst{63-32} = imm;
-}
-
-class SOPPe <bits<7> op> : Enc32 {
-  bits <16> simm16;
-
-  let Inst{15-0} = simm16;
-  let Inst{22-16} = op;
-  let Inst{31-23} = 0x17f; // encoding
-}
-
-class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
-  bits<7> sdst;
-  bits<7> sbase;
-  bits<8> offset;
-
-  let Inst{7-0} = offset;
-  let Inst{8} = imm;
-  let Inst{14-9} = sbase{6-1};
-  let Inst{21-15} = sdst;
-  let Inst{26-22} = op;
-  let Inst{31-27} = 0x18; //encoding
-}
-
-let SchedRW = [WriteSALU] in {
-class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let isCodeGenOnly = 0;
-  let SALU = 1;
-  let SOP1 = 1;
-}
-
-class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let isCodeGenOnly = 0;
-  let SALU = 1;
-  let SOP2 = 1;
-
-  let UseNamedOperandTable = 1;
-}
-
-class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, SOPCe <op> {
-
-  let DisableEncoding = "$dst";
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
-  let SOPC = 1;
-  let isCodeGenOnly = 0;
-
-  let UseNamedOperandTable = 1;
-}
-
-class SOPK <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
-  let SOPK = 1;
-
-  let UseNamedOperandTable = 1;
-}
-
-class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
-    InstSI <(outs), ins, asm, pattern >, SOPPe <op> {
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
-  let SOPP = 1;
-
-  let UseNamedOperandTable = 1;
-}
-
-} // let SchedRW = [WriteSALU]
-
-class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let LGKM_CNT = 1;
-  let SMRD = 1;
-  let mayStore = 0;
-  let mayLoad = 1;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let SchedRW = [WriteSMEM];
-}
-
-//===----------------------------------------------------------------------===//
-// Vector ALU operations
-//===----------------------------------------------------------------------===//
-
-class VOP1e <bits<8> op> : Enc32 {
-  bits<8> vdst;
-  bits<9> src0;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = op;
-  let Inst{24-17} = vdst;
-  let Inst{31-25} = 0x3f; //encoding
-}
-
-class VOP2e <bits<6> op> : Enc32 {
-  bits<8> vdst;
-  bits<9> src0;
-  bits<8> src1;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = src1;
-  let Inst{24-17} = vdst;
-  let Inst{30-25} = op;
-  let Inst{31} = 0x0; //encoding
-}
-
-class VOP2_MADKe <bits<6> op> : Enc64 {
-
-  bits<8> vdst;
-  bits<9> src0;
-  bits<8> vsrc1;
-  bits<32> src2;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = vsrc1;
-  let Inst{24-17} = vdst;
-  let Inst{30-25} = op;
-  let Inst{31} = 0x0; // encoding
-  let Inst{63-32} = src2;
-}
-
-class VOP3e <bits<9> op> : Enc64 {
-  bits<8> vdst;
-  bits<2> src0_modifiers;
-  bits<9> src0;
-  bits<2> src1_modifiers;
-  bits<9> src1;
-  bits<2> src2_modifiers;
-  bits<9> src2;
-  bits<1> clamp;
-  bits<2> omod;
-
-  let Inst{7-0} = vdst;
-  let Inst{8} = src0_modifiers{1};
-  let Inst{9} = src1_modifiers{1};
-  let Inst{10} = src2_modifiers{1};
-  let Inst{11} = clamp;
-  let Inst{25-17} = op;
-  let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = src0;
-  let Inst{49-41} = src1;
-  let Inst{58-50} = src2;
-  let Inst{60-59} = omod;
-  let Inst{61} = src0_modifiers{0};
-  let Inst{62} = src1_modifiers{0};
-  let Inst{63} = src2_modifiers{0};
-}
-
-class VOP3be <bits<9> op> : Enc64 {
-  bits<8> vdst;
-  bits<2> src0_modifiers;
-  bits<9> src0;
-  bits<2> src1_modifiers;
-  bits<9> src1;
-  bits<2> src2_modifiers;
-  bits<9> src2;
-  bits<7> sdst;
-  bits<2> omod;
-
-  let Inst{7-0} = vdst;
-  let Inst{14-8} = sdst;
-  let Inst{25-17} = op;
-  let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = src0;
-  let Inst{49-41} = src1;
-  let Inst{58-50} = src2;
-  let Inst{60-59} = omod;
-  let Inst{61} = src0_modifiers{0};
-  let Inst{62} = src1_modifiers{0};
-  let Inst{63} = src2_modifiers{0};
-}
-
-class VOPCe <bits<8> op> : Enc32 {
-  bits<9> src0;
-  bits<8> vsrc1;
-
-  let Inst{8-0} = src0;
-  let Inst{16-9} = vsrc1;
-  let Inst{24-17} = op;
-  let Inst{31-25} = 0x3e;
-}
-
-class VINTRPe <bits<2> op> : Enc32 {
-  bits<8> vdst;
-  bits<8> vsrc;
-  bits<2> attrchan;
-  bits<6> attr;
-
-  let Inst{7-0} = vsrc;
-  let Inst{9-8} = attrchan;
-  let Inst{15-10} = attr;
-  let Inst{17-16} = op;
-  let Inst{25-18} = vdst;
-  let Inst{31-26} = 0x32; // encoding
-}
-
-class DSe <bits<8> op> : Enc64 {
-  bits<8> vdst;
-  bits<1> gds;
-  bits<8> addr;
-  bits<8> data0;
-  bits<8> data1;
-  bits<8> offset0;
-  bits<8> offset1;
-
-  let Inst{7-0} = offset0;
-  let Inst{15-8} = offset1;
-  let Inst{17} = gds;
-  let Inst{25-18} = op;
-  let Inst{31-26} = 0x36; //encoding
-  let Inst{39-32} = addr;
-  let Inst{47-40} = data0;
-  let Inst{55-48} = data1;
-  let Inst{63-56} = vdst;
-}
-
-class MUBUFe <bits<7> op> : Enc64 {
-  bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<1> addr64;
-  bits<1> lds;
-  bits<8> vaddr;
-  bits<8> vdata;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0} = offset;
-  let Inst{12} = offen;
-  let Inst{13} = idxen;
-  let Inst{14} = glc;
-  let Inst{15} = addr64;
-  let Inst{16} = lds;
-  let Inst{24-18} = op;
-  let Inst{31-26} = 0x38; //encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{54} = slc;
-  let Inst{55} = tfe;
-  let Inst{63-56} = soffset;
-}
-
-class MTBUFe <bits<3> op> : Enc64 {
-  bits<8> vdata;
-  bits<12> offset;
-  bits<1> offen;
-  bits<1> idxen;
-  bits<1> glc;
-  bits<1> addr64;
-  bits<4> dfmt;
-  bits<3> nfmt;
-  bits<8> vaddr;
-  bits<7> srsrc;
-  bits<1> slc;
-  bits<1> tfe;
-  bits<8> soffset;
-
-  let Inst{11-0} = offset;
-  let Inst{12} = offen;
-  let Inst{13} = idxen;
-  let Inst{14} = glc;
-  let Inst{15} = addr64;
-  let Inst{18-16} = op;
-  let Inst{22-19} = dfmt;
-  let Inst{25-23} = nfmt;
-  let Inst{31-26} = 0x3a; //encoding
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{54} = slc;
-  let Inst{55} = tfe;
-  let Inst{63-56} = soffset;
-}
-
-class MIMGe <bits<7> op> : Enc64 {
-  bits<8> vdata;
-  bits<4> dmask;
-  bits<1> unorm;
-  bits<1> glc;
-  bits<1> da;
-  bits<1> r128;
-  bits<1> tfe;
-  bits<1> lwe;
-  bits<1> slc;
-  bits<8> vaddr;
-  bits<7> srsrc;
-  bits<7> ssamp;
-
-  let Inst{11-8} = dmask;
-  let Inst{12} = unorm;
-  let Inst{13} = glc;
-  let Inst{14} = da;
-  let Inst{15} = r128;
-  let Inst{16} = tfe;
-  let Inst{17} = lwe;
-  let Inst{24-18} = op;
-  let Inst{25} = slc;
-  let Inst{31-26} = 0x3c;
-  let Inst{39-32} = vaddr;
-  let Inst{47-40} = vdata;
-  let Inst{52-48} = srsrc{6-2};
-  let Inst{57-53} = ssamp{6-2};
-}
-
-class FLATe <bits<7> op> : Enc64 {
-  bits<8> addr;
-  bits<8> data;
-  bits<8> vdst;
-  bits<1> slc;
-  bits<1> glc;
-  bits<1> tfe;
-
-  // 15-0 is reserved.
-  let Inst{16} = glc;
-  let Inst{17} = slc;
-  let Inst{24-18} = op;
-  let Inst{31-26} = 0x37; // Encoding.
-  let Inst{39-32} = addr;
-  let Inst{47-40} = data;
-  // 54-48 is reserved.
-  let Inst{55} = tfe;
-  let Inst{63-56} = vdst;
-}
-
-class EXPe : Enc64 {
-  bits<4> en;
-  bits<6> tgt;
-  bits<1> compr;
-  bits<1> done;
-  bits<1> vm;
-  bits<8> vsrc0;
-  bits<8> vsrc1;
-  bits<8> vsrc2;
-  bits<8> vsrc3;
-
-  let Inst{3-0} = en;
-  let Inst{9-4} = tgt;
-  let Inst{10} = compr;
-  let Inst{11} = done;
-  let Inst{12} = vm;
-  let Inst{31-26} = 0x3e;
-  let Inst{39-32} = vsrc0;
-  let Inst{47-40} = vsrc1;
-  let Inst{55-48} = vsrc2;
-  let Inst{63-56} = vsrc3;
-}
-
-let Uses = [EXEC] in {
-
-class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP1Common <outs, ins, asm, pattern>,
-    VOP1e <op> {
-  let isCodeGenOnly = 0;
-}
-
-class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP2Common <outs, ins, asm, pattern>, VOP2e <op> {
-  let isCodeGenOnly = 0;
-}
-
-class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
-    VOPCCommon <ins, asm, pattern>, VOPCe <op>;
-
-class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-  let mayLoad = 1;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-}
-
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Vector I/O operations
-//===----------------------------------------------------------------------===//
-
-let Uses = [EXEC] in {
-
-class DS <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let LGKM_CNT = 1;
-  let DS = 1;
-  let UseNamedOperandTable = 1;
-  let Uses = [M0];
-
-  // Most instructions load and store data, so set this as the default.
-  let mayLoad = 1;
-  let mayStore = 1;
-
-  let hasSideEffects = 0;
-  let AsmMatchConverter = "cvtDS";
-  let SchedRW = [WriteLDS];
-}
-
-class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MUBUF = 1;
-
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let AsmMatchConverter = "cvtMubuf";
-  let SchedRW = [WriteVMEM];
-}
-
-class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MTBUF = 1;
-
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let SchedRW = [WriteVMEM];
-}
-
-class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, FLATe <op> {
-  let FLAT = 1;
-  // Internally, FLAT instructions are executed as both an LDS and a
-  // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
-  // and are not considered done until both have been decremented.
-  let VM_CNT = 1;
-  let LGKM_CNT = 1;
-
-  let Uses = [EXEC, FLAT_SCR]; // M0
-
-  let UseNamedOperandTable = 1;
-  let hasSideEffects = 0;
-  let AsmMatchConverter = "cvtFlat";
-  let SchedRW = [WriteVMEM];
-}
-
-class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern>, MIMGe <op> {
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MIMG = 1;
-
-  let hasSideEffects = 0; // XXX ????
-}
-
-
-} // End Uses = [EXEC]
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
deleted file mode 100644
index d647c25286f..00000000000
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ /dev/null
@@ -1,2723 +0,0 @@
-//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief SI Implementation of TargetInstrInfo.
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "SIInstrInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "SIDefines.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
-    : AMDGPUInstrInfo(st), RI() {}
-
-//===----------------------------------------------------------------------===//
-// TargetInstrInfo callbacks
-//===----------------------------------------------------------------------===//
-
-static unsigned getNumOperandsNoGlue(SDNode *Node) {
-  unsigned N = Node->getNumOperands();
-  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
-    --N;
-  return N;
-}
-
-static SDValue findChainOperand(SDNode *Load) {
-  SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
-  assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
-  return LastOp;
-}
-
-/// \brief Returns true if both nodes have the same value for the given
-/// operand \p Op, or if both nodes do not have this operand.
-static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
-  unsigned Opc0 = N0->getMachineOpcode();
-  unsigned Opc1 = N1->getMachineOpcode();
-
-  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
-  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
-
-  if (Op0Idx == -1 && Op1Idx == -1)
-    return true;
-
-
-  if ((Op0Idx == -1 && Op1Idx != -1) ||
-      (Op1Idx == -1 && Op0Idx != -1))
-    return false;
-
-  // getNamedOperandIdx returns the index for the MachineInstr's operands,
-  // which includes the result as the first operand. We are indexing into the
-  // MachineSDNode's operands, so we need to skip the result operand to get
-  // the real index.
-  --Op0Idx;
-  --Op1Idx;
-
-  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
-}
-
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
-                                                    AliasAnalysis *AA) const {
-  // TODO: The generic check fails for VALU instructions that should be
-  // rematerializable due to implicit reads of exec. We really want all of the
-  // generic logic for this except for this.
- switch (MI->getOpcode()) { - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - return true; - default: - return false; - } -} - -bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, - int64_t &Offset0, - int64_t &Offset1) const { - if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) - return false; - - unsigned Opc0 = Load0->getMachineOpcode(); - unsigned Opc1 = Load1->getMachineOpcode(); - - // Make sure both are actually loads. - if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) - return false; - - if (isDS(Opc0) && isDS(Opc1)) { - - // FIXME: Handle this case: - if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) - return false; - - // Check base reg. - if (Load0->getOperand(1) != Load1->getOperand(1)) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - - // Skip read2 / write2 variants for simplicity. - // TODO: We should report true if the used offsets are adjacent (excluded - // st64 versions). - if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 || - AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1) - return false; - - Offset0 = cast(Load0->getOperand(2))->getZExtValue(); - Offset1 = cast(Load1->getOperand(2))->getZExtValue(); - return true; - } - - if (isSMRD(Opc0) && isSMRD(Opc1)) { - assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); - - // Check base reg. - if (Load0->getOperand(0) != Load1->getOperand(0)) - return false; - - const ConstantSDNode *Load0Offset = - dyn_cast(Load0->getOperand(1)); - const ConstantSDNode *Load1Offset = - dyn_cast(Load1->getOperand(1)); - - if (!Load0Offset || !Load1Offset) - return false; - - // Check chain. - if (findChainOperand(Load0) != findChainOperand(Load1)) - return false; - - Offset0 = Load0Offset->getZExtValue(); - Offset1 = Load1Offset->getZExtValue(); - return true; - } - - // MUBUF and MTBUF can access the same addresses. - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { - - // MUBUF and MTBUF have vaddr at different indices. - if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || - findChainOperand(Load0) != findChainOperand(Load1) || - !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || - !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) - return false; - - int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); - int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); - - if (OffIdx0 == -1 || OffIdx1 == -1) - return false; - - // getNamedOperandIdx returns the index for MachineInstrs. Since they - // inlcude the output in the operand list, but SDNodes don't, we need to - // subtract the index by one. - --OffIdx0; - --OffIdx1; - - SDValue Off0 = Load0->getOperand(OffIdx0); - SDValue Off1 = Load1->getOperand(OffIdx1); - - // The offset might be a FrameIndexSDNode. 
- if (!isa(Off0) || !isa(Off1)) - return false; - - Offset0 = cast(Off0)->getZExtValue(); - Offset1 = cast(Off1)->getZExtValue(); - return true; - } - - return false; -} - -static bool isStride64(unsigned Opc) { - switch (Opc) { - case AMDGPU::DS_READ2ST64_B32: - case AMDGPU::DS_READ2ST64_B64: - case AMDGPU::DS_WRITE2ST64_B32: - case AMDGPU::DS_WRITE2ST64_B64: - return true; - default: - return false; - } -} - -bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, - unsigned &BaseReg, unsigned &Offset, - const TargetRegisterInfo *TRI) const { - unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - if (OffsetImm) { - // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); - - BaseReg = AddrReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - // The 2 offset instructions use offset0 and offset1 instead. We can treat - // these as a load with a single offset if the 2 offsets are consecutive. We - // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset1); - - uint8_t Offset0 = Offset0Imm->getImm(); - uint8_t Offset1 = Offset1Imm->getImm(); - assert(Offset1 > Offset0); - - if (Offset1 - Offset0 == 1) { - // Each of these offsets is in element sized units, so we need to convert - // to bytes of the individual reads. - - unsigned EltSize; - if (LdSt->mayLoad()) - EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; - else { - assert(LdSt->mayStore()); - int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); - } - - if (isStride64(Opc)) - EltSize *= 64; - - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); - BaseReg = AddrReg->getReg(); - Offset = EltSize * Offset0; - return true; - } - - return false; - } - - if (isMUBUF(Opc) || isMTBUF(Opc)) { - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) - return false; - - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::vaddr); - if (!AddrReg) - return false; - - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - BaseReg = AddrReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - if (isSMRD(Opc)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); - if (!OffsetImm) - return false; - - const MachineOperand *SBaseReg = getNamedOperand(*LdSt, - AMDGPU::OpName::sbase); - BaseReg = SBaseReg->getReg(); - Offset = OffsetImm->getImm(); - return true; - } - - return false; -} - -bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - - // TODO: This needs finer tuning - if (NumLoads > 4) - return false; - - if (isDS(Opc0) && isDS(Opc1)) - return true; - - if (isSMRD(Opc0) && isSMRD(Opc1)) - return true; - - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) - return true; - - return false; -} - -void -SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - - // If we are trying to copy to or from SCC, there is a bug somewhere else 
in - // the backend. While it may be theoretically possible to do this, it should - // never be necessary. - assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - - static const int16_t Sub0_15[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 - }; - - static const int16_t Sub0_7[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 - }; - - static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 - }; - - static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 - }; - - static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 - }; - - unsigned Opcode; - const int16_t *SubIndices; - - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { - if (DestReg == AMDGPU::VCC) { - if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) - .addReg(SrcReg, getKillRegState(KillSrc)); - } else { - // FIXME: Hack until VReg_1 removed. - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) - .addImm(0) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - - return; - } - - assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; - - } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; - - } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; - - } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - - } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || - AMDGPU::SReg_64RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_1; - - } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_2; - - } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || - AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_3; - - } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || - AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_7; - - } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || - 
AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_15; - - } else { - llvm_unreachable("Can't copy register!"); - } - - while (unsigned SubIdx = *SubIndices++) { - MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, - get(Opcode), RI.getSubReg(DestReg, SubIdx)); - - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); - - if (*SubIndices) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - } -} - -unsigned SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { - const unsigned Opcode = MI.getOpcode(); - - int NewOpc; - - // Try to map original to commuted opcode - NewOpc = AMDGPU::getCommuteRev(Opcode); - // Check if the commuted (REV) opcode exists on the target. - if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) - return NewOpc; - - // Try to map commuted to original opcode - NewOpc = AMDGPU::getCommuteOrig(Opcode); - // Check if the original (non-REV) opcode exists on the target. - if (NewOpc != -1 && pseudoToMCOpcode(NewOpc) != -1) - return NewOpc; - - return Opcode; -} - -unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { - - if (DstRC->getSize() == 4) { - return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) { - return AMDGPU::S_MOV_B64; - } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) { - return AMDGPU::V_MOV_B64_PSEUDO; - } - return AMDGPU::COPY; -} - -void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)) { - // We are only allowed to create one new instruction when spilling - // registers, so we need to use pseudo instruction for spilling - // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } - } - - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. 
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" - " spill register"); - BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); - } -} - -void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - MachineFunction *MF = MBB.getParent(); - const SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } - - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - - } else { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" - " restore register"); - BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); - } -} - -/// \param @Offset Offset in bytes of the FrameIndex being spilled -unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, unsigned TmpReg, - unsigned FrameOffset, - unsigned Size) const { - MachineFunction *MF = MBB.getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - const AMDGPUSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = - static_cast(ST.getRegisterInfo()); - DebugLoc DL = MBB.findDebugLoc(MI); - unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); - unsigned WavefrontSize = ST.getWavefrontSize(); - - unsigned TIDReg = MFI->getTIDReg(); - if (!MFI->hasCalculatedTID()) { - MachineBasicBlock &Entry = MBB.getParent()->front(); - MachineBasicBlock::iterator Insert = Entry.front(); - DebugLoc DL = Insert->getDebugLoc(); - - TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass); - if (TIDReg == AMDGPU::NoRegister) - return TIDReg; - - - if (MFI->getShaderType() == ShaderType::COMPUTE && - WorkGroupSize > WavefrontSize) { - - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); - unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); - for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { - if 
(!Entry.isLiveIn(Reg)) - Entry.addLiveIn(Reg); - } - - RS->enterBasicBlock(&Entry); - unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) - .addReg(InputPtrReg) - .addImm(SI::KernelInputOffsets::NGROUPS_Z); - BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) - .addReg(InputPtrReg) - .addImm(SI::KernelInputOffsets::NGROUPS_Y); - - // NGROUPS.X * NGROUPS.Y - BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) - .addReg(STmp1) - .addReg(STmp0); - // (NGROUPS.X * NGROUPS.Y) * TIDIG.X - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) - .addReg(STmp1) - .addReg(TIDIGXReg); - // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X) - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) - .addReg(STmp0) - .addReg(TIDIGYReg) - .addReg(TIDReg); - // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z - BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) - .addReg(TIDReg) - .addReg(TIDIGZReg); - } else { - // Get the wave id - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), - TIDReg) - .addImm(-1) - .addImm(0); - - BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64), - TIDReg) - .addImm(-1) - .addReg(TIDReg); - } - - BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), - TIDReg) - .addImm(2) - .addReg(TIDReg); - MFI->setTIDReg(TIDReg); - } - - // Add FrameIndex to LDS offset - unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) - .addImm(LDSOffset) - .addReg(TIDReg); - - return TmpReg; -} - -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { - while (Count > 0) { - int Arg; - if (Count >= 8) - Arg = 7; - else - Arg = Count - 1; - Count -= 8; - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) - .addImm(Arg); - } -} - -bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MBB.findDebugLoc(MI); - switch (MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } - case AMDGPU::SGPR_USE: - // This is just a placeholder for register allocation. - MI->eraseFromParent(); - break; - - case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - - const MachineOperand &SrcOp = MI->getOperand(1); - // FIXME: Will this work for 64-bit floating point immediates? 
- assert(!SrcOp.isFPImm()); - if (SrcOp.isImm()) { - APInt Imm(64, SrcOp.getImm()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); - } else { - assert(SrcOp.isReg()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit); - } - MI->eraseFromParent(); - break; - } - - case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI->getOperand(1).getReg(); - unsigned Src1 = MI->getOperand(2).getReg(); - const MachineOperand &SrcCond = MI->getOperand(3); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addOperand(SrcCond); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addOperand(SrcCond); - MI->eraseFromParent(); - break; - } - } - return true; -} - -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - - MachineOperand &Src0 = MI->getOperand(Src0Idx); - if (!Src0.isReg()) - return nullptr; - - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - if (Src1Idx == -1) - return nullptr; - - MachineOperand &Src1 = MI->getOperand(Src1Idx); - - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; - } - - if (!Src1.isReg()) { - // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return nullptr; - } - - // Be sure to copy the source modifiers to the right place. - if (MachineOperand *Src0Mods - = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods - = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); - - int Src0ModsVal = Src0Mods->getImm(); - if (!Src1Mods && Src0ModsVal != 0) - return nullptr; - - // XXX - This assert might be a lie. It might be useful to have a neg - // modifier with 0.0. 
- int Src1ModsVal = Src1Mods->getImm(); - assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); - - Src1Mods->setImm(Src0ModsVal); - Src0Mods->setImm(Src1ModsVal); - } - - unsigned Reg = Src0.getReg(); - unsigned SubReg = Src0.getSubReg(); - if (Src1.isImm()) - Src0.ChangeToImmediate(Src1.getImm()); - else - llvm_unreachable("Should only have immediates"); - - Src1.ChangeToRegister(Reg, false); - Src1.setSubReg(SubReg); - } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); - } - - if (MI) - MI->setDesc(get(commuteOpcode(*MI))); - - return MI; -} - -// This needs to be implemented because the source modifiers may be inserted -// between the true commutable operands, and the base -// TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { - const MCInstrDesc &MCID = MI->getDesc(); - if (!MCID.isCommutable()) - return false; - - unsigned Opc = MI->getOpcode(); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - if (Src0Idx == -1) - return false; - - // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. - if (!MI->getOperand(Src0Idx).isReg()) - return false; - - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - if (Src1Idx == -1) - return false; - - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. - if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) - return false; - - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; -} - -MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, - unsigned SrcReg) const { - return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), - DstReg) .addReg(SrcReg); -} - -bool SIInstrInfo::isMov(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: - return true; - } -} - -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - -static void removeModOperands(MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src0_modifiers); - int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src1_modifiers); - int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2_modifiers); - - MI.RemoveOperand(Src2ModIdx); - MI.RemoveOperand(Src1ModIdx); - MI.RemoveOperand(Src0ModIdx); -} - -bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const { - if (!MRI->hasOneNonDBGUse(Reg)) - return false; - - unsigned Opc = UseMI->getOpcode(); - if (Opc == AMDGPU::V_MAD_F32) { - // Don't fold if we are using source modifiers. The new VOP2 instructions - // don't have them. 
- if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { - return false; - } - - MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); - MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); - MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); - - // Multiplied part is the constant: Use v_madmk_f32 - // We should only expect these to be on src0 due to canonicalizations. - if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; - - if (!Src2->isReg() || - (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) - return false; - - // We need to do some weird looking operand shuffling since the madmk - // operands are out of the normal expected order with the multiplied - // constant as the last operand. - // - // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 - // src0 -> src2 K - // src1 -> src0 - // src2 -> src1 - - const int64_t Imm = DefMI->getOperand(1).getImm(); - - // FIXME: This would be a lot easier if we could return a new instruction - // instead of having to modify in place. - - // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::clamp)); - - unsigned Src1Reg = Src1->getReg(); - unsigned Src1SubReg = Src1->getSubReg(); - unsigned Src2Reg = Src2->getReg(); - unsigned Src2SubReg = Src2->getSubReg(); - Src0->setReg(Src1Reg); - Src0->setSubReg(Src1SubReg); - Src0->setIsKill(Src1->isKill()); - - Src1->setReg(Src2Reg); - Src1->setSubReg(Src2SubReg); - Src1->setIsKill(Src2->isKill()); - - Src2->ChangeToImmediate(Imm); - - removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); - - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); - if (DeleteDef) - DefMI->eraseFromParent(); - - return true; - } - - // Added part is the constant: Use v_madak_f32 - if (Src2->isReg() && Src2->getReg() == Reg) { - // Not allowed to use constant bus for another operand. - // We can however allow an inline immediate as src0. - if (!Src0->isImm() && - (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) - return false; - - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; - - const int64_t Imm = DefMI->getOperand(1).getImm(); - - // FIXME: This would be a lot easier if we could return a new instruction - // instead of having to modify in place. - - // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, - AMDGPU::OpName::clamp)); - - Src2->ChangeToImmediate(Imm); - - // These come before src2. 
- removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); - - bool DeleteDef = MRI->hasOneNonDBGUse(Reg); - if (DeleteDef) - DefMI->eraseFromParent(); - - return true; - } - } - - return false; -} - -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - -static bool offsetsDoNotOverlap(int WidthA, int OffsetA, - int WidthB, int OffsetB) { - int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; - int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; - int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; - return LowOffset + LowWidth <= HighOffset; -} - -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const { - unsigned BaseReg0, Offset0; - unsigned BaseReg1, Offset1; - - if (getLdStBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && - getLdStBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { - assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && - "read2 / write2 not expected here yet"); - unsigned Width0 = (*MIa->memoperands_begin())->getSize(); - unsigned Width1 = (*MIb->memoperands_begin())->getSize(); - if (BaseReg0 == BaseReg1 && - offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { - return true; - } - } - - return false; -} - -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && - "MIa must load from or modify a memory location"); - assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && - "MIb must load from or modify a memory location"); - - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) - return false; - - // XXX - Can we relax this between address spaces? - if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) - return false; - - // TODO: Should we check the address space from the MachineMemOperand? That - // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, - // e.g. private accesses lowered to use MUBUF instructions on a scratch - // buffer. 
- if (isDS(Opc0)) { - if (isDS(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1); - } - - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1) && !isSMRD(Opc1); - } - - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); - } - - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) - return checkInstOffsetsDoNotOverlap(MIa, MIb); - - return false; - } - - return false; -} - -bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int64_t SVal = Imm.getSExtValue(); - if (SVal >= -16 && SVal <= 64) - return true; - - if (Imm.getBitWidth() == 64) { - uint64_t Val = Imm.getZExtValue(); - return (DoubleToBits(0.0) == Val) || - (DoubleToBits(1.0) == Val) || - (DoubleToBits(-1.0) == Val) || - (DoubleToBits(0.5) == Val) || - (DoubleToBits(-0.5) == Val) || - (DoubleToBits(2.0) == Val) || - (DoubleToBits(-2.0) == Val) || - (DoubleToBits(4.0) == Val) || - (DoubleToBits(-4.0) == Val); - } - - // The actual type of the operand does not seem to matter as long - // as the bits match one of the inline immediate values. For example: - // - // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, - // so it is a legal inline immediate. - // - // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in - // floating-point, so it is a legal inline immediate. - uint32_t Val = Imm.getZExtValue(); - - return (FloatToBits(0.0f) == Val) || - (FloatToBits(1.0f) == Val) || - (FloatToBits(-1.0f) == Val) || - (FloatToBits(0.5f) == Val) || - (FloatToBits(-0.5f) == Val) || - (FloatToBits(2.0f) == Val) || - (FloatToBits(-2.0f) == Val) || - (FloatToBits(4.0f) == Val) || - (FloatToBits(-4.0f) == Val); -} - -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - unsigned OpSize) const { - if (MO.isImm()) { - // MachineOperand provides no way to tell the true operand size, since it - // only records a 64-bit value. We need to know the size to determine if a - // 32-bit floating point immediate bit pattern is legal for an integer - // immediate. It would be for any 32-bit integer operand, but would not be - // for a 64-bit one. 
- - unsigned BitSize = 8 * OpSize; - return isInlineConstant(APInt(BitSize, MO.getImm(), true)); - } - - return false; -} - -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, - unsigned OpSize) const { - return MO.isImm() && !isInlineConstant(MO, OpSize); -} - -static bool compareMachineOp(const MachineOperand &Op0, - const MachineOperand &Op1) { - if (Op0.getType() != Op1.getType()) - return false; - - switch (Op0.getType()) { - case MachineOperand::MO_Register: - return Op0.getReg() == Op1.getReg(); - case MachineOperand::MO_Immediate: - return Op0.getImm() == Op1.getImm(); - default: - llvm_unreachable("Didn't expect to be comparing these operand types"); - } -} - -bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; - - assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); - - if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) - return true; - - if (OpInfo.RegClass < 0) - return false; - - unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); - if (isLiteralConstant(MO, OpSize)) - return RI.opCanUseLiteralConstant(OpInfo.OperandType); - - return RI.opCanUseInlineConstant(OpInfo.OperandType); -} - -bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { - int Op32 = AMDGPU::getVOPe32(Opcode); - if (Op32 == -1) - return false; - - return pseudoToMCOpcode(Op32) != -1; -} - -bool SIInstrInfo::hasModifiers(unsigned Opcode) const { - // The src0_modifier operand is present on all instructions - // that have modifiers. - - return AMDGPU::getNamedOperandIdx(Opcode, - AMDGPU::OpName::src0_modifiers) != -1; -} - -bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const { - const MachineOperand *Mods = getNamedOperand(MI, OpName); - return Mods && Mods->getImm(); -} - -bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO, - unsigned OpSize) const { - // Literal constants use the constant bus. - if (isLiteralConstant(MO, OpSize)) - return true; - - if (!MO.isReg() || !MO.isUse()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) - return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); - - // FLAT_SCR is just an SGPR pair. - if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR)) - return true; - - // EXEC register uses the constant bus. - if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) - return true; - - // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - return true; - } - - return false; -} - -bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const { - uint16_t Opcode = MI->getOpcode(); - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); - - // Make sure the number of operands is correct. 
- const MCInstrDesc &Desc = get(Opcode); - if (!Desc.isVariadic() && - Desc.getNumOperands() != MI->getNumExplicitOperands()) { - ErrInfo = "Instruction has wrong number of operands."; - return false; - } - - // Make sure the register classes are correct - for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isFPImm()) { - ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " - "all fp values to integers."; - return false; - } - - int RegClass = Desc.OpInfo[i].RegClass; - - switch (Desc.OpInfo[i].OperandType) { - case MCOI::OPERAND_REGISTER: - if (MI->getOperand(i).isImm()) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } - break; - case AMDGPU::OPERAND_REG_IMM32: - break; - case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI->getOperand(i), - RI.getRegClass(RegClass)->getSize())) { - ErrInfo = "Illegal immediate value for operand."; - return false; - } - break; - case MCOI::OPERAND_IMMEDIATE: - // Check if this operand is an immediate. - // FrameIndex operands will be replaced by immediates, so they are - // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { - ErrInfo = "Expected immediate, but got non-immediate"; - return false; - } - // Fall-through - default: - continue; - } - - if (!MI->getOperand(i).isReg()) - continue; - - if (RegClass != -1) { - unsigned Reg = MI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - - const TargetRegisterClass *RC = RI.getRegClass(RegClass); - if (!RC->contains(Reg)) { - ErrInfo = "Operand has incorrect register class."; - return false; - } - } - } - - - // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { - // Only look at the true operands. Only a real operand can use the constant - // bus, and we don't want to check pseudo-operands like the source modifier - // flags. - const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; - - unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; - for (int OpIdx : OpIndices) { - if (OpIdx == -1) - break; - const MachineOperand &MO = MI->getOperand(OpIdx); - if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { - if (MO.isReg()) { - if (MO.getReg() != SGPRUsed) - ++ConstantBusCount; - SGPRUsed = MO.getReg(); - } else { - ++ConstantBusCount; - } - } - } - if (ConstantBusCount > 1) { - ErrInfo = "VOP* instruction uses the constant bus more than once"; - return false; - } - } - - // Verify misc. restrictions on specific instructions. - if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || - Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - const MachineOperand &Src0 = MI->getOperand(Src0Idx); - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - const MachineOperand &Src2 = MI->getOperand(Src2Idx); - if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { - if (!compareMachineOp(Src0, Src1) && - !compareMachineOp(Src0, Src2)) { - ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; - return false; - } - } - } - - return true; -} - -unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: return AMDGPU::INSTRUCTION_LIST_END; - case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; - case AMDGPU::COPY: return AMDGPU::COPY; - case AMDGPU::PHI: return AMDGPU::PHI; - case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; - case AMDGPU::S_MOV_B32: - return MI.getOperand(1).isReg() ? 
- AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; - case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; - case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; - case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; - case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; - case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; - case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; - case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; - case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; - case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; - case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; - case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; - case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; - case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; - case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; - case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; - case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; - case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; - case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; - case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; - case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; - case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; - case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; - case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; - case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; - case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; - case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; - case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; - case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; - case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; - case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; - case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; - case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; - case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; - case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; - } -} - -bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { - return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; -} - -const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - const MCInstrDesc &Desc = get(MI.getOpcode()); - if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || - Desc.OpInfo[OpNo].RegClass == -1) { - unsigned Reg = MI.getOperand(OpNo).getReg(); - - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return MRI.getRegClass(Reg); - return RI.getPhysRegClass(Reg); - } - - unsigned RCID = Desc.OpInfo[OpNo].RegClass; - return RI.getRegClass(RCID); -} - -bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { - switch (MI.getOpcode()) { - case AMDGPU::COPY: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::PHI: - case AMDGPU::INSERT_SUBREG: - return RI.hasVGPRs(getOpRegClass(MI, 0)); - default: - return RI.hasVGPRs(getOpRegClass(MI, OpNo)); - } -} - -void 
SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { - MachineBasicBlock::iterator I = MI; - MachineBasicBlock *MBB = MI->getParent(); - MachineOperand &MO = MI->getOperand(OpIdx); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; - const TargetRegisterClass *RC = RI.getRegClass(RCID); - unsigned Opcode = AMDGPU::V_MOV_B32_e32; - if (MO.isReg()) - Opcode = AMDGPU::COPY; - else if (RI.isSGPRClass(RC)) - Opcode = AMDGPU::S_MOV_B32; - - - const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); - if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) - VRC = &AMDGPU::VReg_64RegClass; - else - VRC = &AMDGPU::VGPR_32RegClass; - - unsigned Reg = MRI.createVirtualRegister(VRC); - DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) - .addOperand(MO); - MO.ChangeToRegister(Reg, false); -} - -unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) - const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); - unsigned SubReg = MRI.createVirtualRegister(SubRC); - - // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to worry about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to - // eliminate this extra copy. - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) - .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) - .addReg(NewSuperReg, 0, SubIdx); - - return SubReg; -} - -MachineOperand SIInstrInfo::buildExtractSubRegOrImm( - MachineBasicBlock::iterator MII, - MachineRegisterInfo &MRI, - MachineOperand &Op, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const { - if (Op.isImm()) { - // XXX - Is there a better way to do this? 
- if (SubIdx == AMDGPU::sub0) - return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); - if (SubIdx == AMDGPU::sub1) - return MachineOperand::CreateImm(Op.getImm() >> 32); - - llvm_unreachable("Unhandled register index for immediate"); - } - - unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, - SubIdx, SubRC); - return MachineOperand::CreateReg(SubReg, false); -} - -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - -// Change the order of operands from (0, 1, 2) to (0, 2, 1) -void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { - assert(Inst->getNumExplicitOperands() == 3); - MachineOperand Op1 = Inst->getOperand(1); - Inst->RemoveOperand(1); - Inst->addOperand(Op1); -} - -bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, - const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - const MCInstrDesc &InstDesc = get(MI->getOpcode()); - const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; - const TargetRegisterClass *DefinedRC = - OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; - if (!MO) - MO = &MI->getOperand(OpIdx); - - if (isVALU(InstDesc.Opcode) && - usesConstantBus(MRI, *MO, DefinedRC->getSize())) { - unsigned SGPRUsed = - MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - if (i == OpIdx) - continue; - const MachineOperand &Op = MI->getOperand(i); - if (Op.isReg() && Op.getReg() != SGPRUsed && - usesConstantBus(MRI, Op, getOpSize(*MI, i))) { - return false; - } - } - } - - if (MO->isReg()) { - assert(DefinedRC); - const TargetRegisterClass *RC = MRI.getRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; - } - - - // Handle non-register types that are treated like immediates. - assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); - - if (!DefinedRC) { - // This operand expects an immediate. 
- return true; - } - - return isImmOperandLegal(MI, OpIdx, *MO); -} - -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); - - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) - legalizeOpWithMove(MI, Src0Idx); - - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; - - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } - - legalizeOpWithMove(MI, Src1Idx); - return; - } - - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; - - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); - - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); - - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal - - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); - - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); - } - } - - // Legalize REG_SEQUENCE and PHI - // The register class of the operands much be the same type as the register - // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || - MI->getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) - continue; - const TargetRegisterClass *OpRC = - MRI.getRegClass(MI->getOperand(i).getReg()); - if (RI.hasVGPRs(OpRC)) { - VRC = OpRC; - } else { - SRC = OpRC; - } - } - - // If any of the operands are VGPR registers, then they all most be - // otherwise we will create illegal VGPR->SGPR copies when legalizing - // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { - if (!VRC) { - assert(SRC); - VRC = RI.getEquivalentVGPRClass(SRC); - } - RC = VRC; - } else { - RC = SRC; - } - - // Update all the operands so they have the same type. 
-    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
-      if (!MI->getOperand(i).isReg() ||
-          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
-        continue;
-      unsigned DstReg = MRI.createVirtualRegister(RC);
-      MachineBasicBlock *InsertBB;
-      MachineBasicBlock::iterator Insert;
-      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
-        InsertBB = MI->getParent();
-        Insert = MI;
-      } else {
-        // MI is a PHI instruction.
-        InsertBB = MI->getOperand(i + 1).getMBB();
-        Insert = InsertBB->getFirstTerminator();
-      }
-      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
-              get(AMDGPU::COPY), DstReg)
-              .addOperand(MI->getOperand(i));
-      MI->getOperand(i).setReg(DstReg);
-    }
-  }
-
-  // Legalize INSERT_SUBREG
-  // src0 must have the same register class as dst
-  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
-    unsigned Dst = MI->getOperand(0).getReg();
-    unsigned Src0 = MI->getOperand(1).getReg();
-    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
-    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
-    if (DstRC != Src0RC) {
-      MachineBasicBlock &MBB = *MI->getParent();
-      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
-              .addReg(Src0);
-      MI->getOperand(1).setReg(NewSrc0);
-    }
-    return;
-  }
-
-  // Legalize MUBUF* instructions
-  // FIXME: If we start using the non-addr64 instructions for compute, we
-  // may need to legalize them here.
-  int SRsrcIdx =
-      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
-  if (SRsrcIdx != -1) {
-    // We have an MUBUF instruction
-    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
-    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
-    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
-                             RI.getRegClass(SRsrcRC))) {
-      // The operands are legal.
-      // FIXME: We may need to legalize operands besides srsrc.
-      return;
-    }
-
-    MachineBasicBlock &MBB = *MI->getParent();
-    // Extract the ptr from the resource descriptor.
- - // SRsrcPtrLo = srsrc:sub0 - unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); - - // SRsrcPtrHi = srsrc:sub1 - unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); - - // Create an empty resource descriptor - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); - - // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); - - // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - - MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned NewVAddrLo; - unsigned NewVAddrHi; - if (VAddr) { - // This is already an ADDR64 instruction so we need to add the pointer - // extracted from the resource descriptor to the current value of VAddr. - NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) - .addReg(SRsrcPtrLo) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - - // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) - .addReg(SRsrcPtrHi) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine) - .addReg(AMDGPU::VCC, RegState::Implicit); - - } else { - // This instructions is the _OFFSET variant, so we need to convert it to - // ADDR64. - MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); - MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); - MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - - // Create the new instruction. - unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); - MachineInstr *Addr64 = - BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. 
- .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe - - MI->removeFromParent(); - MI = Addr64; - - NewVAddrLo = SRsrcPtrLo; - NewVAddrHi = SRsrcPtrHi; - VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); - } - - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); - - - // Update the instruction to use NewVaddr - VAddr->setReg(NewVAddr); - // Update the instruction to use NewSRsrc - SRsrc->setReg(NewSRsrc); - } -} - -void SIInstrInfo::splitSMRD(MachineInstr *MI, - const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const { - - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RegLo = MRI.createVirtualRegister(HalfRC); - unsigned RegHi = MRI.createVirtualRegister(HalfRC); - unsigned HalfSize = HalfRC->getSize(); - const MachineOperand *OffOp = - getNamedOperand(*MI, AMDGPU::OpName::offset); - const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); - - // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes - // on VI. - - bool IsKill = SBase->isKill(); - if (OffOp) { - bool isVI = - MBB->getParent()->getSubtarget().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS; - unsigned OffScale = isVI ? 1 : 4; - // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm() * OffScale; - unsigned HiOffset = LoOffset + HalfSize; - Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - // Use addReg instead of addOperand - // to make sure kill flag is cleared. - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addImm(LoOffset / OffScale); - - if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { - unsigned OffsetSGPR = - MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset); // The offset in register is in bytes. 
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } else { - Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addImm(HiOffset / OffScale); - } - } else { - // Handle the _SGPR variant - MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); - Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addOperand(*SOff); - unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addOperand(*SOff) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } - - unsigned SubLo, SubHi; - switch (HalfSize) { - case 4: - SubLo = AMDGPU::sub0; - SubHi = AMDGPU::sub1; - break; - case 8: - SubLo = AMDGPU::sub0_sub1; - SubHi = AMDGPU::sub2_sub3; - break; - case 16: - SubLo = AMDGPU::sub0_sub1_sub2_sub3; - SubHi = AMDGPU::sub4_sub5_sub6_sub7; - break; - case 32: - SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; - SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; - break; - default: - llvm_unreachable("Unhandled HalfSize"); - } - - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); -} - -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { - MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { - unsigned NewOpcode = getVALUOp(*MI); - unsigned RegOffset; - unsigned ImmOffset; - - if (MI->getOperand(2).isReg()) { - RegOffset = MI->getOperand(2).getReg(); - ImmOffset = 0; - } else { - assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offsets on SI and byte offset on VI - // and MUBUF instructions always take a byte offset. 
- ImmOffset = MI->getOperand(2).getImm(); - if (MBB->getParent()->getSubtarget().getGeneration() <= - AMDGPUSubtarget::SEA_ISLANDS) - ImmOffset <<= 2; - RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - if (isUInt<12>(ImmOffset)) { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(0); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(ImmOffset); - ImmOffset = 0; - } - } - - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - unsigned DWord0 = RegOffset; - unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) - .addImm(0); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(RsrcDataFormat >> 32); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - break; - } - case AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, - AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); - break; - } - - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, - AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); - break; - } - } -} - -void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector Worklist; - Worklist.push_back(&TopInst); - - while (!Worklist.empty()) { - MachineInstr *Inst = Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - unsigned Opcode = Inst->getOpcode(); - unsigned NewOpcode = getVALUOp(*Inst); - - // Handle some special cases - switch (Opcode) { - default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); - } - break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. 
- if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } - case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_BCNT1_I32_B64: - splitScalar64BitBCNT(Worklist, Inst); - Inst->eraseFromParent(); - continue; - - case AMDGPU::S_BFE_I64: { - splitScalar64BitBFE(Worklist, Inst); - Inst->eraseFromParent(); - continue; - } - - case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHLREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_ASHRREV_I32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHRREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHLREV_B64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_ASHRREV_I64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - NewOpcode = AMDGPU::V_LSHRREV_B64; - swapOperands(Inst); - } - break; - - case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFM_B64: - llvm_unreachable("Moving this op to VALU not implemented"); - } - - if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { - // We cannot move this instruction to the VALU, so we should try to - // legalize its operands instead. - legalizeOperands(Inst); - continue; - } - - // Use the new VALU Opcode. - const MCInstrDesc &NewDesc = get(NewOpcode); - Inst->setDesc(NewDesc); - - // Remove any references to SCC. Vector instructions can't read from it, and - // We're just about to add the implicit use / defs of VCC, and we don't want - // both. - for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) - Inst->RemoveOperand(i); - } - - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(Size)); - - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. - Inst->addOperand(MachineOperand::CreateImm(0)); - } - - addDescImplicitUseDef(NewDesc, Inst); - - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); - - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(BitWidth)); - } - - // Update the destination register class. - - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } - - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - - // Legalize the operands - legalizeOperands(Inst); - - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } - } -} - -//===----------------------------------------------------------------------===// -// Indirect addressing callbacks -//===----------------------------------------------------------------------===// - -unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VGPR_32RegClass; -} - -void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - DebugLoc DL = Inst->getDebugLoc(); - - MachineBasicBlock::iterator MII = Inst; - - const MCInstrDesc &InstDesc = get(Opcode); - const TargetRegisterClass *Src0RC = Src0.isReg() ? 
- MRI.getRegClass(Src0.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); - - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0); - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1); - - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); -} - -void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - MachineOperand &Src1 = Inst->getOperand(2); - DebugLoc DL = Inst->getDebugLoc(); - - MachineBasicBlock::iterator MII = Inst; - - const MCInstrDesc &InstDesc = get(Opcode); - const TargetRegisterClass *Src0RC = Src0.isReg() ? - MRI.getRegClass(Src0.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); - const TargetRegisterClass *Src1RC = Src1.isReg() ? - MRI.getRegClass(Src1.getReg()) : - &AMDGPU::SGPR_32RegClass; - - const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); - - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub0, Src0SubRC); - MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub0, Src1SubRC); - - const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); - - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); - - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub1, Src1SubRC); - - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); - - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); - BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); - - MRI.replaceRegWith(Dest.getReg(), FullDestReg); - - // Try to legalize the operands in case we need to swap the order to keep it - // valid. 
-  Worklist.push_back(LoHalf);
-  Worklist.push_back(HiHalf);
-}
-
-void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
-                                       MachineInstr *Inst) const {
-  MachineBasicBlock &MBB = *Inst->getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst->getDebugLoc();
-
-  MachineOperand &Dest = Inst->getOperand(0);
-  MachineOperand &Src = Inst->getOperand(1);
-
-  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
-  const TargetRegisterClass *SrcRC = Src.isReg() ?
-    MRI.getRegClass(Src.getReg()) :
-    &AMDGPU::SGPR_32RegClass;
-
-  unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
-
-  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
-                                                      AMDGPU::sub0, SrcSubRC);
-  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
-                                                      AMDGPU::sub1, SrcSubRC);
-
-  MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg)
-    .addOperand(SrcRegSub0)
-    .addImm(0);
-
-  MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg)
-    .addOperand(SrcRegSub1)
-    .addReg(MidReg);
-
-  MRI.replaceRegWith(Dest.getReg(), ResultReg);
-
-  Worklist.push_back(First);
-  Worklist.push_back(Second);
-}
-
-void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
-                                      MachineInstr *Inst) const {
-  MachineBasicBlock &MBB = *Inst->getParent();
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-  MachineBasicBlock::iterator MII = Inst;
-  DebugLoc DL = Inst->getDebugLoc();
-
-  MachineOperand &Dest = Inst->getOperand(0);
-  uint32_t Imm = Inst->getOperand(2).getImm();
-  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
-  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
-
-  (void) Offset;
-
-  // Only sext_inreg cases handled.
-  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
-         BitWidth <= 32 &&
-         Offset == 0 &&
-         "Not implemented");
-
-  if (BitWidth < 32) {
-    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
-      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
-      .addImm(0)
-      .addImm(BitWidth);
-
-    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
-      .addImm(31)
-      .addReg(MidRegLo);
-
-    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
-      .addReg(MidRegLo)
-      .addImm(AMDGPU::sub0)
-      .addReg(MidRegHi)
-      .addImm(AMDGPU::sub1);
-
-    MRI.replaceRegWith(Dest.getReg(), ResultReg);
-    return;
-  }
-
-  MachineOperand &Src = Inst->getOperand(1);
-  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-
-  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
-    .addImm(31)
-    .addReg(Src.getReg(), 0, AMDGPU::sub0);
-
-  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
-    .addReg(Src.getReg(), 0, AMDGPU::sub0)
-    .addImm(AMDGPU::sub0)
-    .addReg(TmpReg)
-    .addImm(AMDGPU::sub1);
-
-  MRI.replaceRegWith(Dest.getReg(), ResultReg);
-}
-
-void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
-                                        MachineInstr *Inst) const {
-  // Add the implicit and explicit register definitions.
- if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } - } - - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } - } -} - -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, - int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); - - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - - // First we need to consider the instruction's operand requirements before - // legalizing. Some operands are required to be SGPRs, such as implicit uses - // of VCC, but we are still bound by the constant bus requirement to only use - // one. - // - // If the operand's class is an SGPR, we can never move it. - - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. - if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } - - unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - - for (unsigned i = 0; i < 3; ++i) { - int Idx = OpIndices[i]; - if (Idx == -1) - break; - - const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); - - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } - - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; - - // We don't have a required SGPR operand, so we have a bit more freedom in - // selecting operands to move. - - // Try to select the most used SGPR. If an SGPR is equal to one of the - // others, we choose that. - // - // e.g. 
- // V_FMA_F32 v0, s0, s0, s0 -> No moves - // V_FMA_F32 v0, s0, s1, s0 -> Move s1 - - if (UsedSGPRs[0] != AMDGPU::NoRegister) { - if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) - SGPRReg = UsedSGPRs[0]; - } - - if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { - if (UsedSGPRs[1] == UsedSGPRs[2]) - SGPRReg = UsedSGPRs[1]; - } - - return SGPRReg; -} - -MachineInstrBuilder SIInstrInfo::buildIndirectWrite( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) - .addReg(IndirectBaseReg, RegState::Define) - .addOperand(I->getOperand(0)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0) - .addReg(ValueReg); -} - -MachineInstrBuilder SIInstrInfo::buildIndirectRead( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0); - -} - -void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - int End = getIndirectIndexEnd(MF); - int Begin = getIndirectIndexBegin(MF); - - if (End == -1) - return; - - - for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); -} - -MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, - unsigned OperandName) const { - int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); - if (Idx == -1) - return nullptr; - - return &MI.getOperand(Idx); -} - -uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { - uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; - if (ST.isAmdHsaOS()) - RsrcDataFormat |= (1ULL << 56); - - return RsrcDataFormat; -} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h deleted file mode 100644 index 64b5120841c..00000000000 --- a/lib/Target/R600/SIInstrInfo.h +++ /dev/null @@ -1,391 +0,0 @@ -//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for SIInstrInfo. 
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
-
-#include "AMDGPUInstrInfo.h"
-#include "SIDefines.h"
-#include "SIRegisterInfo.h"
-
-namespace llvm {
-
-class SIInstrInfo : public AMDGPUInstrInfo {
-private:
-  const SIRegisterInfo RI;
-
-  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
-                              MachineRegisterInfo &MRI,
-                              MachineOperand &SuperReg,
-                              const TargetRegisterClass *SuperRC,
-                              unsigned SubIdx,
-                              const TargetRegisterClass *SubRC) const;
-  MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
-                                         MachineRegisterInfo &MRI,
-                                         MachineOperand &SuperReg,
-                                         const TargetRegisterClass *SuperRC,
-                                         unsigned SubIdx,
-                                         const TargetRegisterClass *SubRC) const;
-
-  unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
-                         MachineBasicBlock::iterator MI,
-                         MachineRegisterInfo &MRI,
-                         const TargetRegisterClass *RC,
-                         const MachineOperand &Op) const;
-
-  void swapOperands(MachineBasicBlock::iterator Inst) const;
-
-  void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                               MachineInstr *Inst, unsigned Opcode) const;
-
-  void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
-                                MachineInstr *Inst, unsigned Opcode) const;
-
-  void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
-                            MachineInstr *Inst) const;
-  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
-                           MachineInstr *Inst) const;
-
-  void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
-
-  bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
-                                    MachineInstr *MIb) const;
-
-  unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
-
-public:
-  explicit SIInstrInfo(const AMDGPUSubtarget &st);
-
-  const SIRegisterInfo &getRegisterInfo() const override {
-    return RI;
-  }
-
-  bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
-                                         AliasAnalysis *AA) const override;
-
-  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                               int64_t &Offset1,
-                               int64_t &Offset2) const override;
-
-  bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
-                            unsigned &BaseReg, unsigned &Offset,
-                            const TargetRegisterInfo *TRI) const final;
-
-  bool shouldClusterLoads(MachineInstr *FirstLdSt,
-                          MachineInstr *SecondLdSt,
-                          unsigned NumLoads) const final;
-
-  void copyPhysReg(MachineBasicBlock &MBB,
-                   MachineBasicBlock::iterator MI, DebugLoc DL,
-                   unsigned DestReg, unsigned SrcReg,
-                   bool KillSrc) const override;
-
-  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MI,
-                                    RegScavenger *RS,
-                                    unsigned TmpReg,
-                                    unsigned Offset,
-                                    unsigned Size) const;
-
-  void storeRegToStackSlot(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI,
-                           unsigned SrcReg, bool isKill, int FrameIndex,
-                           const TargetRegisterClass *RC,
-                           const TargetRegisterInfo *TRI) const override;
-
-  void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI,
-                            unsigned DestReg, int FrameIndex,
-                            const TargetRegisterClass *RC,
-                            const TargetRegisterInfo *TRI) const override;
-
-  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
-
-  // \brief Returns an opcode that can be used to move a value to a \p DstRC
-  // register. If there is no hardware instruction that can store to \p
-  // DstRC, then AMDGPU::COPY is returned.
- unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; - unsigned commuteOpcode(const MachineInstr &MI) const; - - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; - bool findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const override; - - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - - bool areMemAccessesTriviallyDisjoint( - MachineInstr *MIa, MachineInstr *MIb, - AliasAnalysis *AA = nullptr) const override; - - MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; - bool isMov(unsigned Opcode) const override; - - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const final; - - unsigned getMachineCSELookAheadLimit() const override { return 500; } - - bool isSALU(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SALU; - } - - bool isVALU(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VALU; - } - - bool isSOP1(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOP1; - } - - bool isSOP2(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOP2; - } - - bool isSOPC(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOPC; - } - - bool isSOPK(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOPK; - } - - bool isSOPP(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SOPP; - } - - bool isVOP1(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP1; - } - - bool isVOP2(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP2; - } - - bool isVOP3(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOP3; - } - - bool isVOPC(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VOPC; - } - - bool isMUBUF(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MUBUF; - } - - bool isMTBUF(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MTBUF; - } - - bool isSMRD(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::SMRD; - } - - bool isDS(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::DS; - } - - bool isMIMG(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::MIMG; - } - - bool isFLAT(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::FLAT; - } - - bool isWQM(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::WQM; - } - - bool isVGPRSpill(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; - } - - bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; - bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; - - bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const; - - /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. - /// This function will return false if you pass it a 32-bit instruction. - bool hasVALU32BitEncoding(unsigned Opcode) const; - - /// \brief Returns true if this operand uses the constant bus. 
- bool usesConstantBus(const MachineRegisterInfo &MRI, - const MachineOperand &MO, - unsigned OpSize) const; - - /// \brief Return true if this instruction has any modifiers. - /// e.g. src[012]_mod, omod, clamp. - bool hasModifiers(unsigned Opcode) const; - - bool hasModifiersSet(const MachineInstr &MI, - unsigned OpName) const; - - bool verifyInstruction(const MachineInstr *MI, - StringRef &ErrInfo) const override; - - static unsigned getVALUOp(const MachineInstr &MI); - - bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; - - /// \brief Return the correct register class for \p OpNo. For target-specific - /// instructions, this will return the register class that has been defined - /// in tablegen. For generic instructions, like REG_SEQUENCE it will return - /// the register class of its machine operand. - /// to infer the correct register class base on the other operands. - const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, - unsigned OpNo) const; - - /// \brief Return the size in bytes of the operand OpNo on the given - // instruction opcode. - unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { - const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; - - if (OpInfo.RegClass == -1) { - // If this is an immediate operand, this must be a 32-bit literal. - assert(OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE); - return 4; - } - - return RI.getRegClass(OpInfo.RegClass)->getSize(); - } - - /// \brief This form should usually be preferred since it handles operands - /// with unknown register classes. - unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { - return getOpRegClass(MI, OpNo)->getSize(); - } - - /// \returns true if it is legal for the operand at index \p OpNo - /// to read a VGPR. - bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; - - /// \brief Legalize the \p OpIndex operand of this instruction by inserting - /// a MOV. For example: - /// ADD_I32_e32 VGPR0, 15 - /// to - /// MOV VGPR1, 15 - /// ADD_I32_e32 VGPR0, VGPR1 - /// - /// If the operand being legalized is a register, then a COPY will be used - /// instead of MOV. - void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; - - /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand - /// for \p MI. - bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, - const MachineOperand *MO = nullptr) const; - - /// \brief Legalize all operands in this instruction. This function may - /// create new instruction and insert them before \p MI. - void legalizeOperands(MachineInstr *MI) const; - - /// \brief Split an SMRD instruction into two smaller loads of half the - // size storing the results in \p Lo and \p Hi. - void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const; - - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; - - /// \brief Replace this instruction's opcode with the equivalent VALU - /// opcode. This function will also move the users of \p MI to the - /// VALU if necessary. 
- void moveToVALU(MachineInstr &MI) const; - - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; - - const TargetRegisterClass *getIndirectAddrRegClass() const override; - - MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const override; - - MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, - unsigned OffsetReg) const override; - void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; - - void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, - unsigned SavReg, unsigned IndexReg) const; - - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; - - /// \brief Returns the operand named \p Op. If \p MI does not have an - /// operand named \c Op, this function returns nullptr. - MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; - - const MachineOperand *getNamedOperand(const MachineInstr &MI, - unsigned OpName) const { - return getNamedOperand(const_cast(MI), OpName); - } - - uint64_t getDefaultRsrcDataFormat() const; - -}; - -namespace AMDGPU { - - int getVOPe64(uint16_t Opcode); - int getVOPe32(uint16_t Opcode); - int getCommuteRev(uint16_t Opcode); - int getCommuteOrig(uint16_t Opcode); - int getAddr64Inst(uint16_t Opcode); - int getAtomicRetOp(uint16_t Opcode); - int getAtomicNoRetOp(uint16_t Opcode); - - const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - const uint64_t RSRC_TID_ENABLE = 1LL << 55; - -} // End namespace AMDGPU - -namespace SI { -namespace KernelInputOffsets { - -/// Offsets in bytes from the start of the input buffer -enum Offsets { - NGROUPS_X = 0, - NGROUPS_Y = 4, - NGROUPS_Z = 8, - GLOBAL_SIZE_X = 12, - GLOBAL_SIZE_Y = 16, - GLOBAL_SIZE_Z = 20, - LOCAL_SIZE_X = 24, - LOCAL_SIZE_Y = 28, - LOCAL_SIZE_Z = 32 -}; - -} // End namespace KernelInputOffsets -} // End namespace SI - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td deleted file mode 100644 index 93e4ca74ec3..00000000000 --- a/lib/Target/R600/SIInstrInfo.td +++ /dev/null @@ -1,2647 +0,0 @@ -//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -def isCI : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; - -def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; - -class vop { - field bits<9> SI3; - field bits<10> VI3; -} - -class vopc si, bits<8> vi = !add(0x40, si)> : vop { - field bits<8> SI = si; - field bits<8> VI = vi; - - field bits<9> SI3 = {0, si{7-0}}; - field bits<10> VI3 = {0, 0, vi{7-0}}; -} - -class vop1 si, bits<8> vi = si> : vop { - field bits<8> SI = si; - field bits<8> VI = vi; - - field bits<9> SI3 = {1, 1, si{6-0}}; - field bits<10> VI3 = !add(0x140, vi); -} - -class vop2 si, bits<6> vi = si> : vop { - field bits<6> SI = si; - field bits<6> VI = vi; - - field bits<9> SI3 = {1, 0, 0, si{5-0}}; - field bits<10> VI3 = {0, 1, 0, 0, vi{5-0}}; -} - -// Specify a VOP2 opcode for SI and VOP3 opcode for VI -// that doesn't have VOP2 encoding on VI -class vop23 si, bits<10> vi> : vop2 { - let VI3 = vi; -} - -class vop3 si, bits<10> vi = {0, si}> : vop { - let SI3 = si; - let VI3 = vi; -} - -class sop1 si, bits<8> vi = si> { - field bits<8> SI = si; - field bits<8> VI = vi; -} - -class sop2 si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -class sopk si, bits<5> vi = si> { - field bits<5> SI = si; - field bits<5> VI = vi; -} - -// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum -// in AMDGPUInstrInfo.cpp -def SISubtarget { - int NONE = -1; - int SI = 0; - int VI = 1; -} - -//===----------------------------------------------------------------------===// -// SI DAG Nodes -//===----------------------------------------------------------------------===// - -def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, - [SDNPMayLoad, SDNPMemOperand] ->; - -def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", - SDTypeProfile<0, 13, - [SDTCisVT<0, v4i32>, // rsrc(SGPR) - SDTCisVT<1, iAny>, // vdata(VGPR) - SDTCisVT<2, i32>, // num_channels(imm) - SDTCisVT<3, i32>, // vaddr(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // inst_offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // offen(imm) - SDTCisVT<9, i32>, // idxen(imm) - SDTCisVT<10, i32>, // glc(imm) - SDTCisVT<11, i32>, // slc(imm) - SDTCisVT<12, i32> // tfe(imm) - ]>, - [SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - -def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>, - SDTCisVT<3, i32>]> ->; - -class SDSample : SDNode , SDTCisVT<2, v32i8>, - SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> ->; - -def SIsample : SDSample<"AMDGPUISD::SAMPLE">; -def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; -def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; -def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; - -def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> ->; - -//===----------------------------------------------------------------------===// -// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 -// to be glued to the memory instructions. 
-//===----------------------------------------------------------------------===// - -def SIld_local : SDNode <"ISD::LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return isLocalLoad(cast(N)); -}]>; - -def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED && - cast(N)->getExtensionType() == ISD::NON_EXTLOAD; -}]>; - -def si_load_local_align8 : Aligned8Bytes < - (ops node:$ptr), (si_load_local node:$ptr) ->; - -def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast(N)->getExtensionType() == ISD::SEXTLOAD; -}]>; -def si_az_extload_local : AZExtLoadBase ; - -multiclass SIExtLoadLocal { - - def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast(N)->getMemoryVT() == MVT::i8;}] - >; - - def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast(N)->getMemoryVT() == MVT::i16;}] - >; -} - -defm si_sextload_local : SIExtLoadLocal ; -defm si_az_extload_local : SIExtLoadLocal ; - -def SIst_local : SDNode <"ISD::STORE", SDTStore, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] ->; - -def si_st_local : PatFrag < - (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return isLocalStore(cast(N)); -}]>; - -def si_store_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast(N)->getAddressingMode() == ISD::UNINDEXED && - !cast(N)->isTruncatingStore(); -}]>; - -def si_store_local_align8 : Aligned8Bytes < - (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) ->; - -def si_truncstore_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast(N)->isTruncatingStore(); -}]>; - -def si_truncstore_local_i8 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def si_truncstore_local_i16 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -multiclass SIAtomicM0Glue2 { - - def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] - >; - - def _local : local_binary_atomic_op (NAME#"_glue")>; -} - -defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; -defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; -defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; -defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; -defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; -defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; -defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; -defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; -defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; -defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; - -def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] ->; - -defm si_atomic_cmp_swap : AtomicCmpSwapLocal ; - -// Transformation function, extract the lower 32bit of a 64bit immediate -def LO32 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N), - MVT::i32); -}]>; - -def LO32f : SDNodeXFormgetValueAPF().bitcastToAPInt().trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32); -}]>; - -// Transformation 
function, extract the upper 32bit of a 64bit immediate -def HI32 : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 32, SDLoc(N), MVT::i32); -}]>; - -def HI32f : SDNodeXFormgetValueAPF().bitcastToAPInt().lshr(32).trunc(32); - return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), SDLoc(N), - MVT::f32); -}]>; - -def IMM8bitDWORD : PatLeaf <(imm), - [{return (N->getZExtValue() & ~0x3FC) == 0;}] ->; - -def as_dword_i32imm : SDNodeXFormgetTargetConstant(N->getZExtValue() >> 2, SDLoc(N), MVT::i32); -}]>; - -def as_i1imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); -}]>; - -def as_i8imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8); -}]>; - -def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); -}]>; - -def as_i32imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32); -}]>; - -def as_i64imm: SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); -}]>; - -// Copied from the AArch64 backend: -def bitcast_fpimm_to_i32 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32); -}]>; - -// Copied from the AArch64 backend: -def bitcast_fpimm_to_i64 : SDNodeXFormgetTargetConstant( - N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); -}]>; - -def IMM8bit : PatLeaf <(imm), - [{return isUInt<8>(N->getZExtValue());}] ->; - -def IMM12bit : PatLeaf <(imm), - [{return isUInt<12>(N->getZExtValue());}] ->; - -def IMM16bit : PatLeaf <(imm), - [{return isUInt<16>(N->getZExtValue());}] ->; - -def IMM20bit : PatLeaf <(imm), - [{return isUInt<20>(N->getZExtValue());}] ->; - -def IMM32bit : PatLeaf <(imm), - [{return isUInt<32>(N->getZExtValue());}] ->; - -def mubuf_vaddr_offset : PatFrag< - (ops node:$ptr, node:$offset, node:$imm_offset), - (add (add node:$ptr, node:$offset), node:$imm_offset) ->; - -class InlineImm : PatLeaf <(vt imm), [{ - return isInlineImmediate(N); -}]>; - -class InlineFPImm : PatLeaf <(vt fpimm), [{ - return isInlineImmediate(N); -}]>; - -class SGPRImm : PatLeafgetGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return false; - } - const SIRegisterInfo *SIRI = - static_cast(Subtarget->getRegisterInfo()); - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { - return true; - } - } - return false; -}]>; - -//===----------------------------------------------------------------------===// -// Custom Operands -//===----------------------------------------------------------------------===// - -def FRAMEri32 : Operand { - let MIOperandInfo = (ops i32:$ptr, i32imm:$index); -} - -def SoppBrTarget : AsmOperandClass { - let Name = "SoppBrTarget"; - let ParserMethod = "parseSOppBrTarget"; -} - -def sopp_brtarget : Operand { - let EncoderMethod = "getSOPPBrEncoding"; - let OperandType = "OPERAND_PCREL"; - let ParserMatchClass = SoppBrTarget; -} - -include "SIInstrFormats.td" -include "VIInstrFormats.td" - -def MubufOffsetMatchClass : AsmOperandClass { - let Name = "MubufOffset"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; -} - -class DSOffsetBaseMatchClass : AsmOperandClass { - let Name = "DSOffset"#parser; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset"; -} - -def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; -def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; - -def 
DSOffset01MatchClass : AsmOperandClass { - let Name = "DSOffset1"; - let ParserMethod = "parseDSOff01OptionalOps"; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset01"; -} - -class GDSBaseMatchClass : AsmOperandClass { - let Name = "GDS"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; -def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; - -class GLCBaseMatchClass : AsmOperandClass { - let Name = "GLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; -def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; - -class SLCBaseMatchClass : AsmOperandClass { - let Name = "SLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; -def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; -def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; - -class TFEBaseMatchClass : AsmOperandClass { - let Name = "TFE"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; -} - -def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; -def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; -def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; - -def OModMatchClass : AsmOperandClass { - let Name = "OMod"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; -} - -def ClampMatchClass : AsmOperandClass { - let Name = "Clamp"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; -} - -let OperandType = "OPERAND_IMMEDIATE" in { - -def offen : Operand { - let PrintMethod = "printOffen"; -} -def idxen : Operand { - let PrintMethod = "printIdxen"; -} -def addr64 : Operand { - let PrintMethod = "printAddr64"; -} -def mbuf_offset : Operand { - let PrintMethod = "printMBUFOffset"; - let ParserMatchClass = MubufOffsetMatchClass; -} -class ds_offset_base : Operand { - let PrintMethod = "printDSOffset"; - let ParserMatchClass = mc; -} -def ds_offset : ds_offset_base ; -def ds_offset_gds : ds_offset_base ; - -def ds_offset0 : Operand { - let PrintMethod = "printDSOffset0"; - let ParserMatchClass = DSOffset01MatchClass; -} -def ds_offset1 : Operand { - let PrintMethod = "printDSOffset1"; - let ParserMatchClass = DSOffset01MatchClass; -} -class gds_base : Operand { - let PrintMethod = "printGDS"; - let ParserMatchClass = mc; -} -def gds : gds_base ; - -def gds01 : gds_base ; - -class glc_base : Operand { - let PrintMethod = "printGLC"; - let ParserMatchClass = mc; -} - -def glc : glc_base ; -def glc_flat : glc_base ; - -class slc_base : Operand { - let PrintMethod = "printSLC"; - let ParserMatchClass = mc; -} - -def slc : slc_base ; -def slc_flat : slc_base ; -def slc_flat_atomic : slc_base ; - -class tfe_base : Operand { - let PrintMethod = "printTFE"; - let ParserMatchClass = mc; -} - -def tfe : tfe_base ; -def tfe_flat : tfe_base ; -def tfe_flat_atomic : tfe_base ; - -def omod : Operand { - let PrintMethod = "printOModSI"; - let ParserMatchClass = OModMatchClass; -} - -def ClampMod : Operand { - let PrintMethod = 
"printClampSI"; - let ParserMatchClass = ClampMatchClass; -} - -} // End OperandType = "OPERAND_IMMEDIATE" - -def VOPDstS64 : VOPDstOperand ; - -//===----------------------------------------------------------------------===// -// Complex patterns -//===----------------------------------------------------------------------===// - -def DS1Addr1Offset : ComplexPattern; -def DS64Bit4ByteAligned : ComplexPattern; - -def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; -def MUBUFAddr64Atomic : ComplexPattern; -def MUBUFScratch : ComplexPattern; -def MUBUFOffset : ComplexPattern; -def MUBUFOffsetAtomic : ComplexPattern; - -def VOP3Mods0 : ComplexPattern; -def VOP3Mods0Clamp : ComplexPattern; -def VOP3Mods0Clamp0OMod : ComplexPattern; -def VOP3Mods : ComplexPattern; - -//===----------------------------------------------------------------------===// -// SI assembler operands -//===----------------------------------------------------------------------===// - -def SIOperand { - int ZERO = 0x80; - int VCC = 0x6A; - int FLAT_SCR = 0x68; -} - -def SRCMODS { - int NONE = 0; - int NEG = 1; -} - -def DSTCLAMP { - int NONE = 0; -} - -def DSTOMOD { - int NONE = 0; -} - -//===----------------------------------------------------------------------===// -// -// SI Instruction multiclass helpers. -// -// Instructions with _32 take 32-bit operands. -// Instructions with _64 take 64-bit operands. -// -// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit -// encoding is the standard encoding, but instruction that make use of -// any of the instruction modifiers must use the 64-bit encoding. -// -// Instructions with _e32 use the 32-bit encoding. -// Instructions with _e64 use the 64-bit encoding. -// -//===----------------------------------------------------------------------===// - -class SIMCInstr { - string PseudoInstr = pseudo; - int Subtarget = subtarget; -} - -//===----------------------------------------------------------------------===// -// EXP classes -//===----------------------------------------------------------------------===// - -class EXPCommon : InstSI< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VGPR_32:$src0, VGPR_32:$src1, VGPR_32:$src2, VGPR_32:$src3), - "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { - - let EXP_CNT = 1; - let Uses = [EXEC]; -} - -multiclass EXP_m { - - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; - } - - def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; - - def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; -} - -//===----------------------------------------------------------------------===// -// Scalar classes -//===----------------------------------------------------------------------===// - -class SOP1_Pseudo pattern> : - SOP1 , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOP1_Real_si : - SOP1 , - SOP1e , - SIMCInstr { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; -} - -class SOP1_Real_vi : - SOP1 , - SOP1e , - SIMCInstr { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; -} - -multiclass SOP1_m pattern> { - - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si ; - - def _vi : SOP1_Real_vi ; - -} - -multiclass SOP1_32 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern ->; - -multiclass SOP1_64 pattern> : SOP1_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" 
$dst, $src0", pattern ->; - -// no input, 64-bit output. -multiclass SOP1_64_0 pattern> { - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si { - let ssrc0 = 0; - } - - def _vi : SOP1_Real_vi { - let ssrc0 = 0; - } -} - -// 64-bit input, no output -multiclass SOP1_1 pattern> { - def "" : SOP1_Pseudo ; - - def _si : SOP1_Real_si { - let sdst = 0; - } - - def _vi : SOP1_Real_vi { - let sdst = 0; - } -} - -// 64-bit input, 32-bit output. -multiclass SOP1_32_64 pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern ->; - -class SOP2_Pseudo pattern> : - SOP2, - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; - let Size = 4; - - // Pseudo instructions have no encodings, but adding this field here allows - // us to do: - // let sdst = xxx in { - // for multiclasses that include both real and pseudo instructions. - field bits<7> sdst = 0; -} - -class SOP2_Real_si : - SOP2, - SOP2e, - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class SOP2_Real_vi : - SOP2, - SOP2e, - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass SOP2_SELECT_32 pattern> { - def "" : SOP2_Pseudo ; - - def _si : SOP2_Real_si ; - - def _vi : SOP2_Real_vi ; -} - -multiclass SOP2_m pattern> { - - def "" : SOP2_Pseudo ; - - def _si : SOP2_Real_si ; - - def _vi : SOP2_Real_vi ; - -} - -multiclass SOP2_32 pattern> : SOP2_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -multiclass SOP2_64 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -multiclass SOP2_64_32 pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern ->; - -class SOPC_Helper op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; - -class SOPC_32 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; - -class SOPC_64 op, string opName, PatLeaf cond = COND_NULL> - : SOPC_Helper; - -class SOPK_Pseudo pattern> : - SOPK , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SOPK_Real_si : - SOPK , - SOPKe , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - let isCodeGenOnly = 0; -} - -class SOPK_Real_vi : - SOPK , - SOPKe , - SIMCInstr { - let AssemblerPredicates = [isVI]; - let isCodeGenOnly = 0; -} - -multiclass SOPK_m { - def "" : SOPK_Pseudo ; - - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; - -} - -multiclass SOPK_32 pattern> { - def "" : SOPK_Pseudo ; - - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; -} - -multiclass SOPK_SCC pattern> { - def "" : SOPK_Pseudo ; - - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si ; - - def _vi : SOPK_Real_vi ; - } -} - -multiclass SOPK_32TIE pattern> : SOPK_m < - op, opName, (outs SReg_32:$sdst), (ins SReg_32:$src0, u16imm:$simm16), - " $sdst, $simm16" ->; - -multiclass SOPK_IMM32 { - - def "" : SOPK_Pseudo ; - - def _si : SOPK , - SOPK64e , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - let isCodeGenOnly = 0; - } - - def _vi : SOPK , - SOPK64e , - SIMCInstr { - let AssemblerPredicates = [isVI]; - let isCodeGenOnly = 0; - } -} -//===----------------------------------------------------------------------===// -// SMRD classes -//===----------------------------------------------------------------------===// - -class SMRD_Pseudo pattern> : - SMRD , - 
SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class SMRD_Real_si op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD , - SMRDe , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class SMRD_Real_vi op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD , - SMEMe_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass SMRD_m op, string opName, bit imm, dag outs, dag ins, - string asm, list pattern> { - - def "" : SMRD_Pseudo ; - - def _si : SMRD_Real_si ; - - // glc is only applicable to scalar stores, which are not yet - // implemented. - let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; - } -} - -multiclass SMRD_Helper op, string opName, RegisterClass baseClass, - RegisterClass dstClass> { - defm _IMM : SMRD_m < - op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), - opName#" $dst, $sbase, $offset", [] - >; - - defm _SGPR : SMRD_m < - op, opName#"_SGPR", 0, (outs dstClass:$dst), - (ins baseClass:$sbase, SReg_32:$soff), - opName#" $dst, $sbase, $soff", [] - >; -} - -//===----------------------------------------------------------------------===// -// Vector ALU classes -//===----------------------------------------------------------------------===// - -// This must always be right before the operand being input modified. -def InputMods : OperandWithDefaultOps { - let PrintMethod = "printOperandAndMods"; -} - -def InputModsMatchClass : AsmOperandClass { - let Name = "RegWithInputMods"; -} - -def InputModsNoDefault : Operand { - let PrintMethod = "printOperandAndMods"; - let ParserMatchClass = InputModsMatchClass; -} - -class getNumSrcArgs { - int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 - !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 -} - -// Returns the register class to use for the destination of VOP[123C] -// instructions for the given VT. -class getVALUDstForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand, - !if(!eq(VT.Size, 64), VOPDstOperand, - VOPDstOperand)); // else VT == i1 -} - -// Returns the register class to use for source 0 of VOP[12C] -// instructions for the given VT. -class getVOPSrc0ForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); -} - -// Returns the register class to use for source 1 of VOP[12C] for the -// given VT. -class getVOPSrc1ForVT { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); -} - -// Returns the register class to use for sources of VOP3 instructions for the -// given VT. -class getVOP3SrcForVT { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); -} - -// Returns 1 if the source arguments have modifiers, 0 if they do not. -class hasModifiers { - bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, 0)); -} - -// Returns the input arguments for VOP[12C] instructions for the given SrcVT. -class getIns32 { - dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 - !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2 - (ins))); -} - -// Returns the input arguments for VOP3 instructions for the given SrcVT. 
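// Illustrative note, not from the original source: since hasModifiers above only
// returns 1 for f32/f64 source types, getIns64 yields the modifier-carrying form
//   (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, ..., ClampMod:$clamp, omod:$omod)
// for floating-point profiles, while integer profiles fall through to the plain
// (ins Src0RC:$src0, ...) variant with no clamp or omod operands.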
-class getIns64 { - - dag ret = - !if (!eq(NumSrcArgs, 1), - !if (!eq(HasModifiers, 1), - // VOP1 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP1 without modifiers - (ins Src0RC:$src0) - /* endif */ ), - !if (!eq(NumSrcArgs, 2), - !if (!eq(HasModifiers, 1), - // VOP 2 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP2 without modifiers - (ins Src0RC:$src0, Src1RC:$src1) - /* endif */ ) - /* NumSrcArgs == 3 */, - !if (!eq(HasModifiers, 1), - // VOP3 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - InputModsNoDefault:$src2_modifiers, Src2RC:$src2, - ClampMod:$clamp, omod:$omod) - /* else */, - // VOP3 without modifiers - (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) - /* endif */ ))); -} - -// Returns the assembly string for the inputs and outputs of a VOP[12C] -// instruction. This does not add the _e32 suffix, so it can be reused -// by getAsm64. -class getAsm32 { - string src1 = ", $src1"; - string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); -} - -// Returns the assembly string for the inputs and outputs of a VOP3 -// instruction. -class getAsm64 { - string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string src1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - string ret = - !if(!eq(HasModifiers, 0), - getAsm32.ret, - "$dst, "#src0#src1#src2#"$clamp"#"$omod"); -} - - -class VOPProfile _ArgVT> { - - field list ArgVT = _ArgVT; - - field ValueType DstVT = ArgVT[0]; - field ValueType Src0VT = ArgVT[1]; - field ValueType Src1VT = ArgVT[2]; - field ValueType Src2VT = ArgVT[3]; - field RegisterOperand DstRC = getVALUDstForVT.ret; - field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; - field RegisterClass Src1RC32 = getVOPSrc1ForVT.ret; - field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; - field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; - field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; - - field int NumSrcArgs = getNumSrcArgs.ret; - field bit HasModifiers = hasModifiers.ret; - - field dag Outs = (outs DstRC:$dst); - - field dag Ins32 = getIns32.ret; - field dag Ins64 = getIns64.ret; - - field string Asm32 = getAsm32.ret; - field string Asm64 = getAsm64.ret; -} - -// FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order -// for the instruction patterns to work. 
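// Illustrative note, not from the original source: the F16/I16 profiles below are
// currently declared with f32/i32 value types (e.g. VOP_F16_F16 uses
// [f32, f32, untyped, untyped]), which is what the FIXME above refers to;
// half-precision patterns cannot match until these use real f16/i16 types.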
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; - -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; -def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; - -def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; -def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>; -def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>; -def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>; -def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>; -def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>; -def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; - -def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>; -def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>; -def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; -def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; -def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; -def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { - let Src0RC32 = VCSrc_32; -} - -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; -} - -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; -} - -def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; -def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; -def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; -def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); - let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); - let Asm64 = "$dst, $src0, $src1, $src2"; -} - -def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; -def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = "$dst, $src0, $vsrc1, $src2"; -} -def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; -def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; -def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; - - -class VOP { - string OpName = opName; -} - -class VOP2_REV { - string RevOp = revOp; - bit IsOrig = isOrig; -} - -class AtomicNoRet { - string NoRetOp = noRetOp; - bit IsRet = isRet; -} - -class VOP1_Pseudo pattern, string opName> : - VOP1Common , - VOP , - SIMCInstr , - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; - - field bits<8> vdst; - field bits<9> src0; -} - -class VOP1_Real_si : - VOP1, - SIMCInstr { - let AssemblerPredicate = SIAssemblerPredicate; -} - -class VOP1_Real_vi : - VOP1, - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP1_m pattern, - string opName> { - def "" : VOP1_Pseudo ; - - def _si : VOP1_Real_si ; - - def _vi : VOP1_Real_vi ; -} - -multiclass VOP1SI_m pattern, - string opName> { - def "" : VOP1_Pseudo ; - - def _si : VOP1_Real_si ; -} - -class VOP2_Pseudo 
pattern, string opName> : - VOP2Common , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP2_Real_si : - VOP2 , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP2_Real_vi : - VOP2 , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP2SI_m pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo , - VOP2_REV; - - def _si : VOP2_Real_si ; -} - -multiclass VOP2_m pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo , - VOP2_REV; - - def _si : VOP2_Real_si ; - - def _vi : VOP2_Real_vi ; - -} - -class VOP3DisableFields { - - bits<2> src0_modifiers = !if(HasModifiers, ?, 0); - bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0); - bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ?, 0), 0); - bits<2> omod = !if(HasModifiers, ?, 0); - bits<1> clamp = !if(HasModifiers, ?, 0); - bits<9> src1 = !if(HasSrc1, ?, 0); - bits<9> src2 = !if(HasSrc2, ?, 0); -} - -class VOP3DisableModFields { - bits<2> src0_modifiers = !if(HasSrc0Mods, ?, 0); - bits<2> src1_modifiers = !if(HasSrc1Mods, ?, 0); - bits<2> src2_modifiers = !if(HasSrc2Mods, ?, 0); - bits<2> omod = !if(HasOutputMods, ?, 0); - bits<1> clamp = !if(HasOutputMods, ?, 0); -} - -class VOP3_Pseudo pattern, string opName> : - VOP3Common , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VOP3_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3e , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP3_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3e_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -class VOP3b_Real_si op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3be , - SIMCInstr { - let AssemblerPredicates = [isSICI]; -} - -class VOP3b_Real_vi op, dag outs, dag ins, string asm, string opName> : - VOP3Common , - VOP3be_vi , - SIMCInstr { - let AssemblerPredicates = [isVI]; -} - -multiclass VOP3_m pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields; - def _vi : VOP3_Real_vi , - VOP3DisableFields; -} - -// VOP3_m without source modifiers -multiclass VOP3_m_nomods pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si ; - def _vi : VOP3_Real_vi ; - } -} - -multiclass VOP3_1_m pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields<0, 0, HasMods>; - - def _vi : VOP3_Real_vi , - VOP3DisableFields<0, 0, HasMods>; -} - -multiclass VOP3SI_1_m pattern, string opName, bit HasMods = 1> { - - def "" : VOP3_Pseudo ; - - def _si : VOP3_Real_si , - VOP3DisableFields<0, 0, HasMods>; - // No VI instruction. This class is for SI only. -} - -multiclass VOP3_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3_Real_vi , - VOP3DisableFields<1, 0, HasMods>; -} - -multiclass VOP3SI_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - // No VI instruction. This class is for SI only. 
-} - -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo , - VOP2_REV; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si , - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi , - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo ; - - - def _si : VOP3b_Real_si , - VOP3DisableFields<1, 1, HasMods>; - - def _vi : VOP3b_Real_vi , - VOP3DisableFields<1, 1, HasMods>; -} - -multiclass VOP3_C_m pattern, string opName, - bit HasMods, bit defExec, string revOp> { - - def "" : VOP3_Pseudo , - VOP2_REV; - - def _si : VOP3_Real_si , - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - } - - def _vi : VOP3_Real_vi , - VOP3DisableFields<1, 0, HasMods> { - let Defs = !if(defExec, [EXEC], []); - } -} - -// An instruction that is VOP2 on SI and VOP3 on VI, no modifiers. -multiclass VOP2SI_3VI_m pattern = []> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : VOPAnyCommon , - SIMCInstr; - } - - def _si : VOP2 , - SIMCInstr { - let AssemblerPredicates = [isSICI]; - } - - def _vi : VOP3Common , - VOP3e_vi , - VOP3DisableFields <1, 0, 0>, - SIMCInstr { - let AssemblerPredicates = [isVI]; - } -} - -multiclass VOP1_Helper pat32, - dag ins64, string asm64, list pat64, - bit HasMods> { - - defm _e32 : VOP1_m ; - - defm _e64 : VOP3_1_m ; -} - -multiclass VOP1Inst : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, - i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers ->; - -multiclass VOP1InstSI { - - defm _e32 : VOP1SI_m ; - - defm _e64 : VOP3SI_1_m ; -} - -multiclass VOP2_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m ; - - defm _e64 : VOP3_2_m ; -} - -multiclass VOP2Inst : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -multiclass VOP2InstSI { - defm _e32 : VOP2SI_m ; - - defm _e64 : VOP3SI_2_m ; -} - -multiclass VOP2b_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - - defm _e32 : VOP2_m ; - - defm _e64 : VOP3b_2_m ; -} - -multiclass VOP2bInst : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -// A VOP2 instruction that is VOP3-only on VI. 
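// Illustrative note, not from the original source: the helper below pairs a
// SI/CI-only 32-bit encoding (VOP2SI_m emits no _vi variant) with a VOP3 _e64
// form for both subtargets, so on VI these opcodes are only available in the
// 64-bit VOP3 encoding.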
-multiclass VOP2_VI3_Helper pat32, - dag ins64, string asm64, list pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m ; - - defm _e64 : VOP3_2_m ; -} - -multiclass VOP2_VI3_Inst - : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers ->; - -multiclass VOP2MADK pattern = []> { - - def "" : VOP2_Pseudo ; - -let isCodeGenOnly = 0 in { - def _si : VOP2Common , - SIMCInstr , - VOP2_MADKe { - let AssemblerPredicates = [isSICI]; - } - - def _vi : VOP2Common , - SIMCInstr , - VOP2_MADKe { - let AssemblerPredicates = [isVI]; - } -} // End isCodeGenOnly = 0 -} - -class VOPC_Pseudo pattern, string opName> : - VOPCCommon , - VOP , - SIMCInstr, - MnemonicAlias { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -multiclass VOPC_m pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo ; - - def _si : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - } - - def _vi : VOPC, - SIMCInstr { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - } -} - -multiclass VOPC_Helper pat32, - dag out64, dag ins64, string asm64, list pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m ; - - defm _e64 : VOP3_C_m ; -} - -// Special case for class instructions which only have modifiers on -// the 1st source operand. -multiclass VOPC_Class_Helper pat32, - dag out64, dag ins64, string asm64, list pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m ; - - defm _e64 : VOP3_C_m , - VOP3DisableModFields<1, 0, 0>; -} - -multiclass VOPCInst : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set i1:$dst, - (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp ->; - -multiclass VOPCClassInst : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, - !if(P.HasModifiers, - [(set i1:$dst, - (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], - [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName ->; - - -multiclass VOPC_F32 : - VOPCInst ; - -multiclass VOPC_F64 : - VOPCInst ; - -multiclass VOPC_I32 : - VOPCInst ; - -multiclass VOPC_I64 : - VOPCInst ; - - -multiclass VOPCX - : VOPCInst ; - -multiclass VOPCX_F32 : - VOPCX ; - -multiclass VOPCX_F64 : - VOPCX ; - -multiclass VOPCX_I32 : - VOPCX ; - -multiclass VOPCX_I64 : - VOPCX ; - -multiclass VOP3_Helper pat, int NumSrcArgs, bit HasMods> : VOP3_m < - op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods ->; - -multiclass VOPC_CLASS_F32 : - VOPCClassInst ; - -multiclass VOPCX_CLASS_F32 : - VOPCClassInst ; - -multiclass VOPC_CLASS_F64 : - VOPCClassInst ; - -multiclass VOPCX_CLASS_F64 : - VOPCClassInst ; - -multiclass VOP3Inst : VOP3_Helper < - op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, - !if(!eq(P.NumSrcArgs, 3), - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, - P.Src2VT:$src2))]), - !if(!eq(P.NumSrcArgs, 2), - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) - /* P.NumSrcArgs == 1 */, - !if(P.HasModifiers, - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), - P.NumSrcArgs, P.HasModifiers ->; - -// Special case for v_div_fmas_{f32|f64}, since it seems to be the -// only VOP instruction that implicitly reads VCC. -multiclass VOP3_VCC_Inst : VOP3_Helper < - op, opName, - (outs P.DstRC.RegClass:$dst), - (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, - InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, - InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, - ClampMod:$clamp, - omod:$omod), - " $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", - [(set P.DstVT:$dst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers)), - (i1 VCC)))], - 3, 1 ->; - -multiclass VOP3b_Helper pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 ->; - -multiclass VOP3b_64 pattern> : - VOP3b_Helper ; - -multiclass VOP3b_32 pattern> : - VOP3b_Helper ; - - -class Vop3ModPat : Pat< - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))), - (Inst i32:$src0_modifiers, P.Src0VT:$src0, - i32:$src1_modifiers, P.Src1VT:$src1, - i32:$src2_modifiers, P.Src2VT:$src2, - i1:$clamp, - i32:$omod)>; - -//===----------------------------------------------------------------------===// -// Interpolation opcodes -//===----------------------------------------------------------------------===// - -class VINTRP_Pseudo pattern> : - VINTRPCommon , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class VINTRP_Real_si op, string opName, dag outs, dag ins, - string asm> : - VINTRPCommon , - VINTRPe , - SIMCInstr; - -class VINTRP_Real_vi op, string opName, dag outs, dag ins, - string asm> : - VINTRPCommon , - VINTRPe_vi , - SIMCInstr; - -multiclass VINTRP_m op, dag outs, dag ins, string asm, - list pattern = []> { - def "" : VINTRP_Pseudo ; - - def _si : VINTRP_Real_si ; - - def _vi : VINTRP_Real_vi ; -} - -//===----------------------------------------------------------------------===// -// Vector I/O classes -//===----------------------------------------------------------------------===// - -class DS_Pseudo pattern> : - DS , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class DS_Real_si op, string opName, dag outs, dag ins, string asm> : - 
DS , - DSe , - SIMCInstr { - let isCodeGenOnly = 0; -} - -class DS_Real_vi op, string opName, dag outs, dag ins, string asm> : - DS , - DSe_vi , - SIMCInstr ; - -class DS_Off16_Real_si op, string opName, dag outs, dag ins, string asm> : - DS_Real_si { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; - let isCodeGenOnly = 0; -} - -class DS_Off16_Real_vi op, string opName, dag outs, dag ins, string asm> : - DS_Real_vi { - - // Single load interpret the 2 i8imm operands as a single i16 offset. - bits<16> offset; - let offset0 = offset{7-0}; - let offset1 = offset{15-8}; -} - -multiclass DS_1A_RET op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr"#"$offset$gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A_Off8_RET op, string opName, RegisterClass rc, - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, - gds01:$gds), - string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } -} - -multiclass DS_1A1D_NORET op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let data1 = 0, vdst = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A1D_Off8_NORET op, string opName, RegisterClass rc, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { - - def "" : DS_Pseudo ; - - let vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } -} - -multiclass DS_1A1D_RET op, string opName, RegisterClass rc, - string noRetOp = "", - dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), - string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_1A2D_RET_m op, string opName, RegisterClass rc, - string noRetOp = "", dag ins, - dag outs = (outs rc:$vdst), - string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; -} - -multiclass DS_1A2D_RET op, string asm, RegisterClass rc, - string noRetOp = "", RegisterClass src = rc> : - DS_1A2D_RET_m ; - -multiclass DS_1A2D_NORET op, string opName, RegisterClass rc, - string noRetOp = opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset:$offset, gds:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { - - def "" : DS_Pseudo , - AtomicNoRet; - - let vdst = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } -} - -multiclass DS_0A_RET op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins ds_offset:$offset, gds:$gds), - string asm 
= opName#" $vdst"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo ; - - let addr = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // end addr = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -multiclass DS_1A_RET_GDS op, string opName, - dag outs = (outs VGPR_32:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), - string asm = opName#" $vdst, $addr"#"$offset gds"> { - - def "" : DS_Pseudo ; - - let data0 = 0, data1 = 0, gds = 1 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // end data0 = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A_GDS op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr), - string asm = opName#" $addr gds"> { - - def "" : DS_Pseudo ; - - let vdst = 0, data0 = 0, data1 = 0, offset0 = 0, offset1 = 0, gds = 1 in { - def _si : DS_Real_si ; - def _vi : DS_Real_vi ; - } // end vdst = 0, data = 0, data1 = 0, gds = 1 -} - -multiclass DS_1A op, string opName, - dag outs = (outs), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), - string asm = opName#" $addr"#"$offset"#"$gds"> { - - let mayLoad = 1, mayStore = 1 in { - def "" : DS_Pseudo ; - - let vdst = 0, data0 = 0, data1 = 0 in { - def _si : DS_Off16_Real_si ; - def _vi : DS_Off16_Real_vi ; - } // let vdst = 0, data0 = 0, data1 = 0 - } // end mayLoad = 1, mayStore = 1 -} - -//===----------------------------------------------------------------------===// -// MTBUF classes -//===----------------------------------------------------------------------===// - -class MTBUF_Pseudo pattern> : - MTBUF , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class MTBUF_Real_si op, string opName, dag outs, dag ins, - string asm> : - MTBUF , - MTBUFe , - SIMCInstr; - -class MTBUF_Real_vi op, string opName, dag outs, dag ins, string asm> : - MTBUF , - MTBUFe_vi , - SIMCInstr ; - -multiclass MTBUF_m op, string opName, dag outs, dag ins, string asm, - list pattern> { - - def "" : MTBUF_Pseudo ; - - def _si : MTBUF_Real_si ; - - def _vi : MTBUF_Real_vi <{0, op{2}, op{1}, op{0}}, opName, outs, ins, asm>; - -} - -let mayStore = 1, mayLoad = 0 in { - -multiclass MTBUF_Store_Helper op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs), - (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, - SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayStore = 1, mayLoad = 0 - -let mayLoad = 1, mayStore = 0 in { - -multiclass MTBUF_Load_Helper op, string opName, - RegisterClass regClass> : MTBUF_m < - op, opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_32:$soffset), - opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," - #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", [] ->; - -} // mayLoad = 1, mayStore = 0 - -//===----------------------------------------------------------------------===// -// MUBUF classes -//===----------------------------------------------------------------------===// - -class mubuf si, bits<7> vi = si> { - field bits<7> SI = si; - field bits<7> VI = vi; -} - -let isCodeGenOnly = 0 in { - -class MUBUF_si op, dag outs, dag ins, string asm, list pattern> 
: - MUBUF , MUBUFe { - let lds = 0; -} - -} // End let isCodeGenOnly = 0 - -class MUBUF_vi op, dag outs, dag ins, string asm, list pattern> : - MUBUF , MUBUFe_vi { - let lds = 0; -} - -class MUBUFAddr64Table { - bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; -} - -class MUBUF_Pseudo pattern> : - MUBUF , - SIMCInstr { - let isPseudo = 1; - let isCodeGenOnly = 1; - - // dummy fields, so that we can use let statements around multiclasses - bits<1> offen; - bits<1> idxen; - bits<8> vaddr; - bits<1> glc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; -} - -class MUBUF_Real_si : - MUBUF , - MUBUFe , - SIMCInstr { - let lds = 0; -} - -class MUBUF_Real_vi : - MUBUF , - MUBUFe_vi , - SIMCInstr { - let lds = 0; -} - -multiclass MUBUF_m pattern> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <0>; - - let addr64 = 0, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si ; - } - - def _vi : MUBUF_Real_vi ; -} - -multiclass MUBUFAddr64_m pattern> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <1>; - - let addr64 = 1, isCodeGenOnly = 0 in { - def _si : MUBUF_Real_si ; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. -} - -multiclass MUBUFAtomicOffset_m pattern, bit is_return> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <0, !if(is_return, "_RTN", "")>, - AtomicNoRet; - - let offen = 0, idxen = 0, tfe = 0, vaddr = 0 in { - let addr64 = 0 in { - def _si : MUBUF_Real_si ; - } - - def _vi : MUBUF_Real_vi ; - } -} - -multiclass MUBUFAtomicAddr64_m pattern, bit is_return> { - - def "" : MUBUF_Pseudo , - MUBUFAddr64Table <1, !if(is_return, "_RTN", "")>, - AtomicNoRet; - - let offen = 0, idxen = 0, addr64 = 1, tfe = 0 in { - def _si : MUBUF_Real_si ; - } - - // There is no VI version. If the pseudo is selected, it should be lowered - // for VI appropriately. 
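  // Illustrative note, not from the original source: the MUBUF addr64 addressing
  // mode is not available on VI, which is presumably why only an _si real
  // instruction is emitted here and the pseudo has to be re-lowered to a
  // different addressing mode when targeting VI.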
-} - -multiclass MUBUF_Atomic { - - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { - - // No return variants - let glc = 0 in { - - defm _ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 - >; - - defm _OFFSET : MUBUFAtomicOffset_m < - op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, - slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 - >; - } // glc = 0 - - // Variant that return values - let glc = 1, Constraints = "$vdata = $vdata_in", - DisableEncoding = "$vdata_in" in { - - defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < - op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", - [(set vt:$vdata, - (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$slc), vt:$vdata_in))], 1 - >; - - defm _RTN_OFFSET : MUBUFAtomicOffset_m < - op, name#"_rtn_offset", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", - [(set vt:$vdata, - (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, - i1:$slc), vt:$vdata_in))], 1 - >; - - } // glc = 1 - - } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 -} - -multiclass MUBUF_Load_Helper { - - let mayLoad = 1, mayStore = 0 in { - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m ; - } - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m ; - } - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m ; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m ; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m ; - } - } -} - -multiclass MUBUF_Store_Helper { - let mayLoad = 0, mayStore = 1 in { - defm : MUBUF_m ; - - let offen = 0, idxen = 0, vaddr = 0 in { - defm _OFFSET : MUBUF_m ; - } // offen = 0, idxen = 0, vaddr = 0 - - let offen = 1, idxen = 0 in { - defm _OFFEN : MUBUF_m ; - } // end offen = 1, idxen = 0 - - let offen = 0, idxen = 1 in { - defm _IDXEN : MUBUF_m ; - } - - let offen = 1, idxen = 1 in { - defm _BOTHEN : MUBUF_m ; - } - - let offen = 0, idxen = 0 in { - defm _ADDR64 : MUBUFAddr64_m ; - } - } // End mayLoad = 0, mayStore = 1 -} - -class FLAT_Load_Helper op, string asm, RegisterClass regClass> : - FLAT { - let data = 0; - let mayLoad = 1; -} - -class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : - FLAT { - - let mayLoad = 0; - let mayStore = 1; - - // Encoding - let vdst = 0; -} - -multiclass FLAT_ATOMIC op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { - - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT , - AtomicNoRet { - let glc = 0; - let vdst = 0; - } - - def _RTN : FLAT , - AtomicNoRet { - let glc = 1; - } - } -} - -class MIMG_Mask { - string Op = op; - int Channels = channels; -} - -class MIMG_NoSampler_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc", - []> { - let 
ssamp = 0; - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; -} - -multiclass MIMG_NoSampler_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V2 : MIMG_NoSampler_Helper , - MIMG_Mask; - def _V4 : MIMG_NoSampler_Helper , - MIMG_Mask; -} - -multiclass MIMG_NoSampler op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper ; - defm _V2 : MIMG_NoSampler_Src_Helper ; - defm _V3 : MIMG_NoSampler_Src_Helper ; - defm _V4 : MIMG_NoSampler_Src_Helper ; -} - -class MIMG_Sampler_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; - let WQM = wqm; -} - -multiclass MIMG_Sampler_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V2 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V4 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V8 : MIMG_Sampler_Helper , - MIMG_Mask; - def _V16 : MIMG_Sampler_Helper , - MIMG_Mask; -} - -multiclass MIMG_Sampler op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} - -multiclass MIMG_Sampler_WQM op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper; - defm _V2 : MIMG_Sampler_Src_Helper; - defm _V3 : MIMG_Sampler_Src_Helper; - defm _V4 : MIMG_Sampler_Src_Helper; -} - -class MIMG_Gather_Helper op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, - (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - - // DMASK was repurposed for GATHER4. 4 components are always - // returned and DMASK works like a swizzle - it selects - // the component to fetch. The only useful DMASK values are - // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns - // (red,red,red,red) etc.) The ISA document doesn't mention - // this. 
- // Therefore, disable all code which updates DMASK by setting these two: - let MIMG = 0; - let hasPostISelHook = 0; - let WQM = wqm; -} - -multiclass MIMG_Gather_Src_Helper op, string asm, - RegisterClass dst_rc, - int channels, int wqm> { - def _V1 : MIMG_Gather_Helper , - MIMG_Mask; - def _V2 : MIMG_Gather_Helper , - MIMG_Mask; - def _V4 : MIMG_Gather_Helper , - MIMG_Mask; - def _V8 : MIMG_Gather_Helper , - MIMG_Mask; - def _V16 : MIMG_Gather_Helper , - MIMG_Mask; -} - -multiclass MIMG_Gather op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper; - defm _V2 : MIMG_Gather_Src_Helper; - defm _V3 : MIMG_Gather_Src_Helper; - defm _V4 : MIMG_Gather_Src_Helper; -} - -multiclass MIMG_Gather_WQM op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper; - defm _V2 : MIMG_Gather_Src_Helper; - defm _V3 : MIMG_Gather_Src_Helper; - defm _V4 : MIMG_Gather_Src_Helper; -} - -//===----------------------------------------------------------------------===// -// Vector instruction mappings -//===----------------------------------------------------------------------===// - -// Maps an opcode in e32 form to its e64 equivalent -def getVOPe64 : InstrMapping { - let FilterClass = "VOP"; - let RowFields = ["OpName"]; - let ColFields = ["Size"]; - let KeyCol = ["4"]; - let ValueCols = [["8"]]; -} - -// Maps an opcode in e64 form to its e32 equivalent -def getVOPe32 : InstrMapping { - let FilterClass = "VOP"; - let RowFields = ["OpName"]; - let ColFields = ["Size"]; - let KeyCol = ["8"]; - let ValueCols = [["4"]]; -} - -def getMaskedMIMGOp : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["4"]; - let ValueCols = [["1"], ["2"], ["3"] ]; -} - -// Maps an commuted opcode to its original version -def getCommuteOrig : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an original opcode to its commuted version -def getCommuteRev : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - -def getCommuteCmpOrig : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an original opcode to its commuted version -def getCommuteCmpRev : InstrMapping { - let FilterClass = "VOP2_REV"; - let RowFields = ["RevOp"]; - let ColFields = ["IsOrig"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - - -def getMCOpcodeGen : InstrMapping { - let FilterClass = "SIMCInstr"; - let RowFields = ["PseudoInstr"]; - let ColFields = ["Subtarget"]; - let KeyCol = [!cast(SISubtarget.NONE)]; - let ValueCols = [[!cast(SISubtarget.SI)],[!cast(SISubtarget.VI)]]; -} - -def getAddr64Inst : InstrMapping { - let FilterClass = "MUBUFAddr64Table"; - let RowFields = ["OpName"]; - let ColFields = ["IsAddr64"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an atomic opcode to its version with a return value. -def getAtomicRetOp : InstrMapping { - let FilterClass = "AtomicNoRet"; - let RowFields = ["NoRetOp"]; - let ColFields = ["IsRet"]; - let KeyCol = ["0"]; - let ValueCols = [["1"]]; -} - -// Maps an atomic opcode to its returnless version. 
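// Sketch, assuming the standard TableGen InstrMapping code generation (not part
// of this file): each mapping above becomes a lookup function emitted into
// AMDGPUGenInstrInfo.inc that returns -1 when no counterpart exists, so C++ code
// can switch an atomic to its returnless form roughly like this (MI and TII are
// assumed to be a MachineInstr& and a const SIInstrInfo*):
//   int NoRetOpc = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
//   if (NoRetOpc != -1)
//     MI.setDesc(TII->get(NoRetOpc));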
-def getAtomicNoRetOp : InstrMapping { - let FilterClass = "AtomicNoRet"; - let RowFields = ["NoRetOp"]; - let ColFields = ["IsRet"]; - let KeyCol = ["1"]; - let ValueCols = [["0"]]; -} - -include "SIInstructions.td" -include "CIInstructions.td" -include "VIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td deleted file mode 100644 index 8c8d836776d..00000000000 --- a/lib/Target/R600/SIInstructions.td +++ /dev/null @@ -1,3327 +0,0 @@ -//===-- SIInstructions.td - SI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// This file was originally auto-generated from a GPU register header file and -// all the instruction definitions were originally commented out. Instructions -// that are not yet supported remain commented out. -//===----------------------------------------------------------------------===// - -class InterpSlots { -int P0 = 2; -int P10 = 0; -int P20 = 1; -} -def INTERP : InterpSlots; - -def InterpSlot : Operand { - let PrintMethod = "printInterpSlot"; -} - -def SendMsgImm : Operand { - let PrintMethod = "printSendMsg"; -} - -def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureGCN">; -def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; - -def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; -def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; - -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def WAIT_FLAG : InstFlag<"printWaitFlag"> { - let ParserMatchClass = SWaitMatchClass; -} - -let SubtargetPredicate = isGCN in { - -//===----------------------------------------------------------------------===// -// EXP Instructions -//===----------------------------------------------------------------------===// - -defm EXP : EXP_m; - -//===----------------------------------------------------------------------===// -// SMRD Instructions -//===----------------------------------------------------------------------===// - -let mayLoad = 1 in { - -// We are using the SGPR_32 and not the SReg_32 register class for 32-bit -// SMRD instructions, because the SGPR_32 register class does not include M0 -// and writing to M0 from an SMRD instruction will hang the GPU. 
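// Illustrative note, not from the original source: accordingly, only the
// single-dword loads below use SGPR_32 as the destination class; the dwordx2 and
// wider variants use the SReg_64/SReg_128/SReg_256/SReg_512 tuple classes, where
// the M0 hazard does not arise.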
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; - -defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 ->; - -defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 ->; - -defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 ->; - -} // mayLoad = 1 - -//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; - -//===----------------------------------------------------------------------===// -// SOP1 Instructions -//===----------------------------------------------------------------------===// - -let isMoveImm = 1 in { - let isReMaterializable = 1, isAsCheapAsAMove = 1 in { - defm S_MOV_B32 : SOP1_32 , "s_mov_b32", []>; - defm S_MOV_B64 : SOP1_64 , "s_mov_b64", []>; - } // let isRematerializeable = 1 - - let Uses = [SCC] in { - defm S_CMOV_B32 : SOP1_32 , "s_cmov_b32", []>; - defm S_CMOV_B64 : SOP1_64 , "s_cmov_b64", []>; - } // End Uses = [SCC] -} // End isMoveImm = 1 - -let Defs = [SCC] in { - defm S_NOT_B32 : SOP1_32 , "s_not_b32", - [(set i32:$dst, (not i32:$src0))] - >; - - defm S_NOT_B64 : SOP1_64 , "s_not_b64", - [(set i64:$dst, (not i64:$src0))] - >; - defm S_WQM_B32 : SOP1_32 , "s_wqm_b32", []>; - defm S_WQM_B64 : SOP1_64 , "s_wqm_b64", []>; -} // End Defs = [SCC] - - -defm S_BREV_B32 : SOP1_32 , "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] ->; -defm S_BREV_B64 : SOP1_64 , "s_brev_b64", []>; - -let Defs = [SCC] in { - defm S_BCNT0_I32_B32 : SOP1_32 , "s_bcnt0_i32_b32", []>; - defm S_BCNT0_I32_B64 : SOP1_32_64 , "s_bcnt0_i32_b64", []>; - defm S_BCNT1_I32_B32 : SOP1_32 , "s_bcnt1_i32_b32", - [(set i32:$dst, (ctpop i32:$src0))] - >; - defm S_BCNT1_I32_B64 : SOP1_32_64 , "s_bcnt1_i32_b64", []>; -} // End Defs = [SCC] - -defm S_FF0_I32_B32 : SOP1_32 , "s_ff0_i32_b32", []>; -defm S_FF0_I32_B64 : SOP1_32_64 , "s_ff0_i32_b64", []>; -defm S_FF1_I32_B32 : SOP1_32 , "s_ff1_i32_b32", - [(set i32:$dst, (cttz_zero_undef i32:$src0))] ->; -defm S_FF1_I32_B64 : SOP1_32_64 , "s_ff1_i32_b64", []>; - -defm S_FLBIT_I32_B32 : SOP1_32 , "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] ->; - -defm S_FLBIT_I32_B64 : SOP1_32_64 , "s_flbit_i32_b64", []>; -defm S_FLBIT_I32 : SOP1_32 , "s_flbit_i32", - [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] ->; -defm S_FLBIT_I32_I64 : SOP1_32_64 , "s_flbit_i32_i64", []>; -defm S_SEXT_I32_I8 : SOP1_32 , "s_sext_i32_i8", - [(set i32:$dst, (sext_inreg i32:$src0, i8))] ->; -defm S_SEXT_I32_I16 : SOP1_32 , "s_sext_i32_i16", - [(set i32:$dst, (sext_inreg i32:$src0, i16))] ->; - -defm S_BITSET0_B32 : SOP1_32 , "s_bitset0_b32", []>; -defm S_BITSET0_B64 : SOP1_64 , "s_bitset0_b64", []>; -defm S_BITSET1_B32 : SOP1_32 , "s_bitset1_b32", []>; -defm S_BITSET1_B64 : SOP1_64 , "s_bitset1_b64", []>; -defm S_GETPC_B64 : SOP1_64_0 , "s_getpc_b64", []>; -defm S_SETPC_B64 : SOP1_64 , "s_setpc_b64", []>; -defm S_SWAPPC_B64 : SOP1_64 , 
"s_swappc_b64", []>; -defm S_RFE_B64 : SOP1_64 , "s_rfe_b64", []>; - -let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { - -defm S_AND_SAVEEXEC_B64 : SOP1_64 , "s_and_saveexec_b64", []>; -defm S_OR_SAVEEXEC_B64 : SOP1_64 , "s_or_saveexec_b64", []>; -defm S_XOR_SAVEEXEC_B64 : SOP1_64 , "s_xor_saveexec_b64", []>; -defm S_ANDN2_SAVEEXEC_B64 : SOP1_64 , "s_andn2_saveexec_b64", []>; -defm S_ORN2_SAVEEXEC_B64 : SOP1_64 , "s_orn2_saveexec_b64", []>; -defm S_NAND_SAVEEXEC_B64 : SOP1_64 , "s_nand_saveexec_b64", []>; -defm S_NOR_SAVEEXEC_B64 : SOP1_64 , "s_nor_saveexec_b64", []>; -defm S_XNOR_SAVEEXEC_B64 : SOP1_64 , "s_xnor_saveexec_b64", []>; - -} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] - -defm S_QUADMASK_B32 : SOP1_32 , "s_quadmask_b32", []>; -defm S_QUADMASK_B64 : SOP1_64 , "s_quadmask_b64", []>; -defm S_MOVRELS_B32 : SOP1_32 , "s_movrels_b32", []>; -defm S_MOVRELS_B64 : SOP1_64 , "s_movrels_b64", []>; -defm S_MOVRELD_B32 : SOP1_32 , "s_movreld_b32", []>; -defm S_MOVRELD_B64 : SOP1_64 , "s_movreld_b64", []>; -defm S_CBRANCH_JOIN : SOP1_1 , "s_cbranch_join", []>; -defm S_MOV_REGRD_B32 : SOP1_32 , "s_mov_regrd_b32", []>; -let Defs = [SCC] in { - defm S_ABS_I32 : SOP1_32 , "s_abs_i32", []>; -} // End Defs = [SCC] -defm S_MOV_FED_B32 : SOP1_32 , "s_mov_fed_b32", []>; - -//===----------------------------------------------------------------------===// -// SOP2 Instructions -//===----------------------------------------------------------------------===// - -let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { -defm S_ADD_U32 : SOP2_32 , "s_add_u32", []>; -defm S_ADD_I32 : SOP2_32 , "s_add_i32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] ->; -} // End isCommutable = 1 - -defm S_SUB_U32 : SOP2_32 , "s_sub_u32", []>; -defm S_SUB_I32 : SOP2_32 , "s_sub_i32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] ->; - -let Uses = [SCC] in { // Carry in comes from SCC -let isCommutable = 1 in { -defm S_ADDC_U32 : SOP2_32 , "s_addc_u32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End isCommutable = 1 - -defm S_SUBB_U32 : SOP2_32 , "s_subb_u32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; -} // End Uses = [SCC] - -defm S_MIN_I32 : SOP2_32 , "s_min_i32", - [(set i32:$dst, (smin i32:$src0, i32:$src1))] ->; -defm S_MIN_U32 : SOP2_32 , "s_min_u32", - [(set i32:$dst, (umin i32:$src0, i32:$src1))] ->; -defm S_MAX_I32 : SOP2_32 , "s_max_i32", - [(set i32:$dst, (smax i32:$src0, i32:$src1))] ->; -defm S_MAX_U32 : SOP2_32 , "s_max_u32", - [(set i32:$dst, (umax i32:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - - -let Uses = [SCC] in { - defm S_CSELECT_B32 : SOP2_32 , "s_cselect_b32", []>; - defm S_CSELECT_B64 : SOP2_64 , "s_cselect_b64", []>; -} // End Uses = [SCC] - -let Defs = [SCC] in { -defm S_AND_B32 : SOP2_32 , "s_and_b32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] ->; - -defm S_AND_B64 : SOP2_64 , "s_and_b64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] ->; - -defm S_OR_B32 : SOP2_32 , "s_or_b32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] ->; - -defm S_OR_B64 : SOP2_64 , "s_or_b64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] ->; - -defm S_XOR_B32 : SOP2_32 , "s_xor_b32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] ->; - -defm S_XOR_B64 : SOP2_64 , "s_xor_b64", - [(set i64:$dst, (xor i64:$src0, i64:$src1))] ->; -defm S_ANDN2_B32 : SOP2_32 , "s_andn2_b32", []>; -defm S_ANDN2_B64 : SOP2_64 , "s_andn2_b64", []>; -defm S_ORN2_B32 : SOP2_32 , "s_orn2_b32", []>; -defm 
S_ORN2_B64 : SOP2_64 , "s_orn2_b64", []>; -defm S_NAND_B32 : SOP2_32 , "s_nand_b32", []>; -defm S_NAND_B64 : SOP2_64 , "s_nand_b64", []>; -defm S_NOR_B32 : SOP2_32 , "s_nor_b32", []>; -defm S_NOR_B64 : SOP2_64 , "s_nor_b64", []>; -defm S_XNOR_B32 : SOP2_32 , "s_xnor_b32", []>; -defm S_XNOR_B64 : SOP2_64 , "s_xnor_b64", []>; -} // End Defs = [SCC] - -// Use added complexity so these patterns are preferred to the VALU patterns. -let AddedComplexity = 1 in { -let Defs = [SCC] in { - -defm S_LSHL_B32 : SOP2_32 , "s_lshl_b32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] ->; -defm S_LSHL_B64 : SOP2_64_32 , "s_lshl_b64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] ->; -defm S_LSHR_B32 : SOP2_32 , "s_lshr_b32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] ->; -defm S_LSHR_B64 : SOP2_64_32 , "s_lshr_b64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] ->; -defm S_ASHR_I32 : SOP2_32 , "s_ashr_i32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] ->; -defm S_ASHR_I64 : SOP2_64_32 , "s_ashr_i64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] ->; -} // End Defs = [SCC] - -defm S_BFM_B32 : SOP2_32 , "s_bfm_b32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm S_BFM_B64 : SOP2_64 , "s_bfm_b64", []>; -defm S_MUL_I32 : SOP2_32 , "s_mul_i32", - [(set i32:$dst, (mul i32:$src0, i32:$src1))] ->; - -} // End AddedComplexity = 1 - -let Defs = [SCC] in { -defm S_BFE_U32 : SOP2_32 , "s_bfe_u32", []>; -defm S_BFE_I32 : SOP2_32 , "s_bfe_i32", []>; -defm S_BFE_U64 : SOP2_64 , "s_bfe_u64", []>; -defm S_BFE_I64 : SOP2_64_32 , "s_bfe_i64", []>; -} // End Defs = [SCC] - -let sdst = 0 in { -defm S_CBRANCH_G_FORK : SOP2_m < - sop2<0x2b, 0x29>, "s_cbranch_g_fork", (outs), - (ins SReg_64:$src0, SReg_64:$src1), "s_cbranch_g_fork $src0, $src1", [] ->; -} - -let Defs = [SCC] in { -defm S_ABSDIFF_I32 : SOP2_32 , "s_absdiff_i32", []>; -} // End Defs = [SCC] - -//===----------------------------------------------------------------------===// -// SOPC Instructions -//===----------------------------------------------------------------------===// - -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; - -//===----------------------------------------------------------------------===// -// SOPK Instructions -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1 in { -defm S_MOVK_I32 : SOPK_32 , "s_movk_i32", []>; -} // End isReMaterializable = 1 -let Uses = [SCC] in { - defm S_CMOVK_I32 : SOPK_32 , "s_cmovk_i32", []>; -} - -let isCompare = 1 in { - -/* -This instruction is disabled for 
now until we can figure out how to teach -the instruction selector to correctly use the S_CMP* vs V_CMP* -instructions. - -When this instruction is enabled the code generator sometimes produces this -invalid sequence: - -SCC = S_CMPK_EQ_I32 SGPR0, imm -VCC = COPY SCC -VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 - -defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", - [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] ->; -*/ - -defm S_CMPK_EQ_I32 : SOPK_SCC , "s_cmpk_eq_i32", []>; -defm S_CMPK_LG_I32 : SOPK_SCC , "s_cmpk_lg_i32", []>; -defm S_CMPK_GT_I32 : SOPK_SCC , "s_cmpk_gt_i32", []>; -defm S_CMPK_GE_I32 : SOPK_SCC , "s_cmpk_ge_i32", []>; -defm S_CMPK_LT_I32 : SOPK_SCC , "s_cmpk_lt_i32", []>; -defm S_CMPK_LE_I32 : SOPK_SCC , "s_cmpk_le_i32", []>; -defm S_CMPK_EQ_U32 : SOPK_SCC , "s_cmpk_eq_u32", []>; -defm S_CMPK_LG_U32 : SOPK_SCC , "s_cmpk_lg_u32", []>; -defm S_CMPK_GT_U32 : SOPK_SCC , "s_cmpk_gt_u32", []>; -defm S_CMPK_GE_U32 : SOPK_SCC , "s_cmpk_ge_u32", []>; -defm S_CMPK_LT_U32 : SOPK_SCC , "s_cmpk_lt_u32", []>; -defm S_CMPK_LE_U32 : SOPK_SCC , "s_cmpk_le_u32", []>; -} // End isCompare = 1 - -let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0", - Constraints = "$sdst = $src0" in { - defm S_ADDK_I32 : SOPK_32TIE , "s_addk_i32", []>; - defm S_MULK_I32 : SOPK_32TIE , "s_mulk_i32", []>; -} - -defm S_CBRANCH_I_FORK : SOPK_m < - sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs), - (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; -defm S_GETREG_B32 : SOPK_32 , "s_getreg_b32", []>; -defm S_SETREG_B32 : SOPK_m < - sopk<0x13, 0x12>, "s_setreg_b32", (outs), - (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16" ->; -// FIXME: Not on SI? -//defm S_GETREG_REGRD_B32 : SOPK_32 , "s_getreg_regrd_b32", []>; -defm S_SETREG_IMM32_B32 : SOPK_IMM32 < - sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs), - (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16" ->; - -//===----------------------------------------------------------------------===// -// SOPP Instructions -//===----------------------------------------------------------------------===// - -def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">; - -let isTerminator = 1 in { - -def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm", - [(IL_retflag)]> { - let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; -} - -let isBranch = 1 in { -def S_BRANCH : SOPP < - 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", - [(br bb:$simm16)]> { - let isBarrier = 1; -} - -let DisableEncoding = "$scc" in { -def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "s_cbranch_scc0 $simm16" ->; -def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), - "s_cbranch_scc1 $simm16" ->; -} // End DisableEncoding = "$scc" - -def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "s_cbranch_vccz $simm16" ->; -def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), - "s_cbranch_vccnz $simm16" ->; - -let DisableEncoding = "$exec" in { -def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "s_cbranch_execz $simm16" ->; -def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), - "s_cbranch_execnz $simm16" ->; -} // End DisableEncoding = "$exec" - - -} // End isBranch = 1 -} // End isTerminator = 1 - -let hasSideEffects = 1 in { -def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", - [(int_AMDGPU_barrier_local)] -> { - let simm16 = 0; - let isBarrier = 1; - let 
hasCtrlDep = 1; - let mayLoad = 1; - let mayStore = 1; -} - -def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; -def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; -def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">; -def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">; - -let Uses = [EXEC, M0] in { - def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", - [(AMDGPUsendmsg (i32 imm:$simm16))] - >; -} // End Uses = [EXEC, M0] - -def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; -def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; -def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { - let simm16 = 0; -} -def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">; -def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">; -def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> { - let simm16 = 0; -} -} // End hasSideEffects - -//===----------------------------------------------------------------------===// -// VOPC Instructions -//===----------------------------------------------------------------------===// - -let isCompare = 1, isCommutable = 1 in { - -defm V_CMP_F_F32 : VOPC_F32 , "v_cmp_f_f32">; -defm V_CMP_LT_F32 : VOPC_F32 , "v_cmp_lt_f32", COND_OLT, "v_cmp_gt_f32">; -defm V_CMP_EQ_F32 : VOPC_F32 , "v_cmp_eq_f32", COND_OEQ>; -defm V_CMP_LE_F32 : VOPC_F32 , "v_cmp_le_f32", COND_OLE, "v_cmp_ge_f32">; -defm V_CMP_GT_F32 : VOPC_F32 , "v_cmp_gt_f32", COND_OGT>; -defm V_CMP_LG_F32 : VOPC_F32 , "v_cmp_lg_f32", COND_ONE>; -defm V_CMP_GE_F32 : VOPC_F32 , "v_cmp_ge_f32", COND_OGE>; -defm V_CMP_O_F32 : VOPC_F32 , "v_cmp_o_f32", COND_O>; -defm V_CMP_U_F32 : VOPC_F32 , "v_cmp_u_f32", COND_UO>; -defm V_CMP_NGE_F32 : VOPC_F32 , "v_cmp_nge_f32", COND_ULT, "v_cmp_nle_f32">; -defm V_CMP_NLG_F32 : VOPC_F32 , "v_cmp_nlg_f32", COND_UEQ>; -defm V_CMP_NGT_F32 : VOPC_F32 , "v_cmp_ngt_f32", COND_ULE, "v_cmp_nlt_f32">; -defm V_CMP_NLE_F32 : VOPC_F32 , "v_cmp_nle_f32", COND_UGT>; -defm V_CMP_NEQ_F32 : VOPC_F32 , "v_cmp_neq_f32", COND_UNE>; -defm V_CMP_NLT_F32 : VOPC_F32 , "v_cmp_nlt_f32", COND_UGE>; -defm V_CMP_TRU_F32 : VOPC_F32 , "v_cmp_tru_f32">; - - -defm V_CMPX_F_F32 : VOPCX_F32 , "v_cmpx_f_f32">; -defm V_CMPX_LT_F32 : VOPCX_F32 , "v_cmpx_lt_f32", "v_cmpx_gt_f32">; -defm V_CMPX_EQ_F32 : VOPCX_F32 , "v_cmpx_eq_f32">; -defm V_CMPX_LE_F32 : VOPCX_F32 , "v_cmpx_le_f32", "v_cmpx_ge_f32">; -defm V_CMPX_GT_F32 : VOPCX_F32 , "v_cmpx_gt_f32">; -defm V_CMPX_LG_F32 : VOPCX_F32 , "v_cmpx_lg_f32">; -defm V_CMPX_GE_F32 : VOPCX_F32 , "v_cmpx_ge_f32">; -defm V_CMPX_O_F32 : VOPCX_F32 , "v_cmpx_o_f32">; -defm V_CMPX_U_F32 : VOPCX_F32 , "v_cmpx_u_f32">; -defm V_CMPX_NGE_F32 : VOPCX_F32 , "v_cmpx_nge_f32">; -defm V_CMPX_NLG_F32 : VOPCX_F32 , "v_cmpx_nlg_f32">; -defm V_CMPX_NGT_F32 : VOPCX_F32 , "v_cmpx_ngt_f32">; -defm V_CMPX_NLE_F32 : VOPCX_F32 , "v_cmpx_nle_f32">; -defm V_CMPX_NEQ_F32 : VOPCX_F32 , "v_cmpx_neq_f32">; -defm V_CMPX_NLT_F32 : VOPCX_F32 , "v_cmpx_nlt_f32">; -defm V_CMPX_TRU_F32 : VOPCX_F32 , "v_cmpx_tru_f32">; - - -defm V_CMP_F_F64 : VOPC_F64 , "v_cmp_f_f64">; -defm V_CMP_LT_F64 : VOPC_F64 , "v_cmp_lt_f64", COND_OLT, "v_cmp_gt_f64">; -defm V_CMP_EQ_F64 : VOPC_F64 , "v_cmp_eq_f64", COND_OEQ>; -defm V_CMP_LE_F64 : VOPC_F64 , "v_cmp_le_f64", COND_OLE, "v_cmp_ge_f64">; -defm V_CMP_GT_F64 : VOPC_F64 , "v_cmp_gt_f64", COND_OGT>; -defm V_CMP_LG_F64 : VOPC_F64 , 
"v_cmp_lg_f64", COND_ONE>; -defm V_CMP_GE_F64 : VOPC_F64 , "v_cmp_ge_f64", COND_OGE>; -defm V_CMP_O_F64 : VOPC_F64 , "v_cmp_o_f64", COND_O>; -defm V_CMP_U_F64 : VOPC_F64 , "v_cmp_u_f64", COND_UO>; -defm V_CMP_NGE_F64 : VOPC_F64 , "v_cmp_nge_f64", COND_ULT, "v_cmp_nle_f64">; -defm V_CMP_NLG_F64 : VOPC_F64 , "v_cmp_nlg_f64", COND_UEQ>; -defm V_CMP_NGT_F64 : VOPC_F64 , "v_cmp_ngt_f64", COND_ULE, "v_cmp_nlt_f64">; -defm V_CMP_NLE_F64 : VOPC_F64 , "v_cmp_nle_f64", COND_UGT>; -defm V_CMP_NEQ_F64 : VOPC_F64 , "v_cmp_neq_f64", COND_UNE>; -defm V_CMP_NLT_F64 : VOPC_F64 , "v_cmp_nlt_f64", COND_UGE>; -defm V_CMP_TRU_F64 : VOPC_F64 , "v_cmp_tru_f64">; - - -defm V_CMPX_F_F64 : VOPCX_F64 , "v_cmpx_f_f64">; -defm V_CMPX_LT_F64 : VOPCX_F64 , "v_cmpx_lt_f64", "v_cmpx_gt_f64">; -defm V_CMPX_EQ_F64 : VOPCX_F64 , "v_cmpx_eq_f64">; -defm V_CMPX_LE_F64 : VOPCX_F64 , "v_cmpx_le_f64", "v_cmpx_ge_f64">; -defm V_CMPX_GT_F64 : VOPCX_F64 , "v_cmpx_gt_f64">; -defm V_CMPX_LG_F64 : VOPCX_F64 , "v_cmpx_lg_f64">; -defm V_CMPX_GE_F64 : VOPCX_F64 , "v_cmpx_ge_f64">; -defm V_CMPX_O_F64 : VOPCX_F64 , "v_cmpx_o_f64">; -defm V_CMPX_U_F64 : VOPCX_F64 , "v_cmpx_u_f64">; -defm V_CMPX_NGE_F64 : VOPCX_F64 , "v_cmpx_nge_f64", "v_cmpx_nle_f64">; -defm V_CMPX_NLG_F64 : VOPCX_F64 , "v_cmpx_nlg_f64">; -defm V_CMPX_NGT_F64 : VOPCX_F64 , "v_cmpx_ngt_f64", "v_cmpx_nlt_f64">; -defm V_CMPX_NLE_F64 : VOPCX_F64 , "v_cmpx_nle_f64">; -defm V_CMPX_NEQ_F64 : VOPCX_F64 , "v_cmpx_neq_f64">; -defm V_CMPX_NLT_F64 : VOPCX_F64 , "v_cmpx_nlt_f64">; -defm V_CMPX_TRU_F64 : VOPCX_F64 , "v_cmpx_tru_f64">; - - -let SubtargetPredicate = isSICI in { - -defm V_CMPS_F_F32 : VOPC_F32 , "v_cmps_f_f32">; -defm V_CMPS_LT_F32 : VOPC_F32 , "v_cmps_lt_f32", COND_NULL, "v_cmps_gt_f32">; -defm V_CMPS_EQ_F32 : VOPC_F32 , "v_cmps_eq_f32">; -defm V_CMPS_LE_F32 : VOPC_F32 , "v_cmps_le_f32", COND_NULL, "v_cmps_ge_f32">; -defm V_CMPS_GT_F32 : VOPC_F32 , "v_cmps_gt_f32">; -defm V_CMPS_LG_F32 : VOPC_F32 , "v_cmps_lg_f32">; -defm V_CMPS_GE_F32 : VOPC_F32 , "v_cmps_ge_f32">; -defm V_CMPS_O_F32 : VOPC_F32 , "v_cmps_o_f32">; -defm V_CMPS_U_F32 : VOPC_F32 , "v_cmps_u_f32">; -defm V_CMPS_NGE_F32 : VOPC_F32 , "v_cmps_nge_f32", COND_NULL, "v_cmps_nle_f32">; -defm V_CMPS_NLG_F32 : VOPC_F32 , "v_cmps_nlg_f32">; -defm V_CMPS_NGT_F32 : VOPC_F32 , "v_cmps_ngt_f32", COND_NULL, "v_cmps_nlt_f32">; -defm V_CMPS_NLE_F32 : VOPC_F32 , "v_cmps_nle_f32">; -defm V_CMPS_NEQ_F32 : VOPC_F32 , "v_cmps_neq_f32">; -defm V_CMPS_NLT_F32 : VOPC_F32 , "v_cmps_nlt_f32">; -defm V_CMPS_TRU_F32 : VOPC_F32 , "v_cmps_tru_f32">; - - -defm V_CMPSX_F_F32 : VOPCX_F32 , "v_cmpsx_f_f32">; -defm V_CMPSX_LT_F32 : VOPCX_F32 , "v_cmpsx_lt_f32", "v_cmpsx_gt_f32">; -defm V_CMPSX_EQ_F32 : VOPCX_F32 , "v_cmpsx_eq_f32">; -defm V_CMPSX_LE_F32 : VOPCX_F32 , "v_cmpsx_le_f32", "v_cmpsx_ge_f32">; -defm V_CMPSX_GT_F32 : VOPCX_F32 , "v_cmpsx_gt_f32">; -defm V_CMPSX_LG_F32 : VOPCX_F32 , "v_cmpsx_lg_f32">; -defm V_CMPSX_GE_F32 : VOPCX_F32 , "v_cmpsx_ge_f32">; -defm V_CMPSX_O_F32 : VOPCX_F32 , "v_cmpsx_o_f32">; -defm V_CMPSX_U_F32 : VOPCX_F32 , "v_cmpsx_u_f32">; -defm V_CMPSX_NGE_F32 : VOPCX_F32 , "v_cmpsx_nge_f32", "v_cmpsx_nle_f32">; -defm V_CMPSX_NLG_F32 : VOPCX_F32 , "v_cmpsx_nlg_f32">; -defm V_CMPSX_NGT_F32 : VOPCX_F32 , "v_cmpsx_ngt_f32", "v_cmpsx_nlt_f32">; -defm V_CMPSX_NLE_F32 : VOPCX_F32 , "v_cmpsx_nle_f32">; -defm V_CMPSX_NEQ_F32 : VOPCX_F32 , "v_cmpsx_neq_f32">; -defm V_CMPSX_NLT_F32 : VOPCX_F32 , "v_cmpsx_nlt_f32">; -defm V_CMPSX_TRU_F32 : VOPCX_F32 , "v_cmpsx_tru_f32">; - - -defm V_CMPS_F_F64 : VOPC_F64 , "v_cmps_f_f64">; -defm 
V_CMPS_LT_F64 : VOPC_F64 , "v_cmps_lt_f64", COND_NULL, "v_cmps_gt_f64">; -defm V_CMPS_EQ_F64 : VOPC_F64 , "v_cmps_eq_f64">; -defm V_CMPS_LE_F64 : VOPC_F64 , "v_cmps_le_f64", COND_NULL, "v_cmps_ge_f64">; -defm V_CMPS_GT_F64 : VOPC_F64 , "v_cmps_gt_f64">; -defm V_CMPS_LG_F64 : VOPC_F64 , "v_cmps_lg_f64">; -defm V_CMPS_GE_F64 : VOPC_F64 , "v_cmps_ge_f64">; -defm V_CMPS_O_F64 : VOPC_F64 , "v_cmps_o_f64">; -defm V_CMPS_U_F64 : VOPC_F64 , "v_cmps_u_f64">; -defm V_CMPS_NGE_F64 : VOPC_F64 , "v_cmps_nge_f64", COND_NULL, "v_cmps_nle_f64">; -defm V_CMPS_NLG_F64 : VOPC_F64 , "v_cmps_nlg_f64">; -defm V_CMPS_NGT_F64 : VOPC_F64 , "v_cmps_ngt_f64", COND_NULL, "v_cmps_nlt_f64">; -defm V_CMPS_NLE_F64 : VOPC_F64 , "v_cmps_nle_f64">; -defm V_CMPS_NEQ_F64 : VOPC_F64 , "v_cmps_neq_f64">; -defm V_CMPS_NLT_F64 : VOPC_F64 , "v_cmps_nlt_f64">; -defm V_CMPS_TRU_F64 : VOPC_F64 , "v_cmps_tru_f64">; - - -defm V_CMPSX_F_F64 : VOPCX_F64 , "v_cmpsx_f_f64">; -defm V_CMPSX_LT_F64 : VOPCX_F64 , "v_cmpsx_lt_f64", "v_cmpsx_gt_f64">; -defm V_CMPSX_EQ_F64 : VOPCX_F64 , "v_cmpsx_eq_f64">; -defm V_CMPSX_LE_F64 : VOPCX_F64 , "v_cmpsx_le_f64", "v_cmpsx_ge_f64">; -defm V_CMPSX_GT_F64 : VOPCX_F64 , "v_cmpsx_gt_f64">; -defm V_CMPSX_LG_F64 : VOPCX_F64 , "v_cmpsx_lg_f64">; -defm V_CMPSX_GE_F64 : VOPCX_F64 , "v_cmpsx_ge_f64">; -defm V_CMPSX_O_F64 : VOPCX_F64 , "v_cmpsx_o_f64">; -defm V_CMPSX_U_F64 : VOPCX_F64 , "v_cmpsx_u_f64">; -defm V_CMPSX_NGE_F64 : VOPCX_F64 , "v_cmpsx_nge_f64", "v_cmpsx_nle_f64">; -defm V_CMPSX_NLG_F64 : VOPCX_F64 , "v_cmpsx_nlg_f64">; -defm V_CMPSX_NGT_F64 : VOPCX_F64 , "v_cmpsx_ngt_f64", "v_cmpsx_nlt_f64">; -defm V_CMPSX_NLE_F64 : VOPCX_F64 , "v_cmpsx_nle_f64">; -defm V_CMPSX_NEQ_F64 : VOPCX_F64 , "v_cmpsx_neq_f64">; -defm V_CMPSX_NLT_F64 : VOPCX_F64 , "v_cmpsx_nlt_f64">; -defm V_CMPSX_TRU_F64 : VOPCX_F64 , "v_cmpsx_tru_f64">; - -} // End SubtargetPredicate = isSICI - -defm V_CMP_F_I32 : VOPC_I32 , "v_cmp_f_i32">; -defm V_CMP_LT_I32 : VOPC_I32 , "v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">; -defm V_CMP_EQ_I32 : VOPC_I32 , "v_cmp_eq_i32", COND_EQ>; -defm V_CMP_LE_I32 : VOPC_I32 , "v_cmp_le_i32", COND_SLE, "v_cmp_ge_i32">; -defm V_CMP_GT_I32 : VOPC_I32 , "v_cmp_gt_i32", COND_SGT>; -defm V_CMP_NE_I32 : VOPC_I32 , "v_cmp_ne_i32", COND_NE>; -defm V_CMP_GE_I32 : VOPC_I32 , "v_cmp_ge_i32", COND_SGE>; -defm V_CMP_T_I32 : VOPC_I32 , "v_cmp_t_i32">; - - -defm V_CMPX_F_I32 : VOPCX_I32 , "v_cmpx_f_i32">; -defm V_CMPX_LT_I32 : VOPCX_I32 , "v_cmpx_lt_i32", "v_cmpx_gt_i32">; -defm V_CMPX_EQ_I32 : VOPCX_I32 , "v_cmpx_eq_i32">; -defm V_CMPX_LE_I32 : VOPCX_I32 , "v_cmpx_le_i32", "v_cmpx_ge_i32">; -defm V_CMPX_GT_I32 : VOPCX_I32 , "v_cmpx_gt_i32">; -defm V_CMPX_NE_I32 : VOPCX_I32 , "v_cmpx_ne_i32">; -defm V_CMPX_GE_I32 : VOPCX_I32 , "v_cmpx_ge_i32">; -defm V_CMPX_T_I32 : VOPCX_I32 , "v_cmpx_t_i32">; - - -defm V_CMP_F_I64 : VOPC_I64 , "v_cmp_f_i64">; -defm V_CMP_LT_I64 : VOPC_I64 , "v_cmp_lt_i64", COND_SLT, "v_cmp_gt_i64">; -defm V_CMP_EQ_I64 : VOPC_I64 , "v_cmp_eq_i64", COND_EQ>; -defm V_CMP_LE_I64 : VOPC_I64 , "v_cmp_le_i64", COND_SLE, "v_cmp_ge_i64">; -defm V_CMP_GT_I64 : VOPC_I64 , "v_cmp_gt_i64", COND_SGT>; -defm V_CMP_NE_I64 : VOPC_I64 , "v_cmp_ne_i64", COND_NE>; -defm V_CMP_GE_I64 : VOPC_I64 , "v_cmp_ge_i64", COND_SGE>; -defm V_CMP_T_I64 : VOPC_I64 , "v_cmp_t_i64">; - - -defm V_CMPX_F_I64 : VOPCX_I64 , "v_cmpx_f_i64">; -defm V_CMPX_LT_I64 : VOPCX_I64 , "v_cmpx_lt_i64", "v_cmpx_gt_i64">; -defm V_CMPX_EQ_I64 : VOPCX_I64 , "v_cmpx_eq_i64">; -defm V_CMPX_LE_I64 : VOPCX_I64 , "v_cmpx_le_i64", "v_cmpx_ge_i64">; -defm V_CMPX_GT_I64 : 
VOPCX_I64 , "v_cmpx_gt_i64">; -defm V_CMPX_NE_I64 : VOPCX_I64 , "v_cmpx_ne_i64">; -defm V_CMPX_GE_I64 : VOPCX_I64 , "v_cmpx_ge_i64">; -defm V_CMPX_T_I64 : VOPCX_I64 , "v_cmpx_t_i64">; - - -defm V_CMP_F_U32 : VOPC_I32 , "v_cmp_f_u32">; -defm V_CMP_LT_U32 : VOPC_I32 , "v_cmp_lt_u32", COND_ULT, "v_cmp_gt_u32">; -defm V_CMP_EQ_U32 : VOPC_I32 , "v_cmp_eq_u32", COND_EQ>; -defm V_CMP_LE_U32 : VOPC_I32 , "v_cmp_le_u32", COND_ULE, "v_cmp_ge_u32">; -defm V_CMP_GT_U32 : VOPC_I32 , "v_cmp_gt_u32", COND_UGT>; -defm V_CMP_NE_U32 : VOPC_I32 , "v_cmp_ne_u32", COND_NE>; -defm V_CMP_GE_U32 : VOPC_I32 , "v_cmp_ge_u32", COND_UGE>; -defm V_CMP_T_U32 : VOPC_I32 , "v_cmp_t_u32">; - - -defm V_CMPX_F_U32 : VOPCX_I32 , "v_cmpx_f_u32">; -defm V_CMPX_LT_U32 : VOPCX_I32 , "v_cmpx_lt_u32", "v_cmpx_gt_u32">; -defm V_CMPX_EQ_U32 : VOPCX_I32 , "v_cmpx_eq_u32">; -defm V_CMPX_LE_U32 : VOPCX_I32 , "v_cmpx_le_u32", "v_cmpx_le_u32">; -defm V_CMPX_GT_U32 : VOPCX_I32 , "v_cmpx_gt_u32">; -defm V_CMPX_NE_U32 : VOPCX_I32 , "v_cmpx_ne_u32">; -defm V_CMPX_GE_U32 : VOPCX_I32 , "v_cmpx_ge_u32">; -defm V_CMPX_T_U32 : VOPCX_I32 , "v_cmpx_t_u32">; - - -defm V_CMP_F_U64 : VOPC_I64 , "v_cmp_f_u64">; -defm V_CMP_LT_U64 : VOPC_I64 , "v_cmp_lt_u64", COND_ULT, "v_cmp_gt_u64">; -defm V_CMP_EQ_U64 : VOPC_I64 , "v_cmp_eq_u64", COND_EQ>; -defm V_CMP_LE_U64 : VOPC_I64 , "v_cmp_le_u64", COND_ULE, "v_cmp_ge_u64">; -defm V_CMP_GT_U64 : VOPC_I64 , "v_cmp_gt_u64", COND_UGT>; -defm V_CMP_NE_U64 : VOPC_I64 , "v_cmp_ne_u64", COND_NE>; -defm V_CMP_GE_U64 : VOPC_I64 , "v_cmp_ge_u64", COND_UGE>; -defm V_CMP_T_U64 : VOPC_I64 , "v_cmp_t_u64">; - -defm V_CMPX_F_U64 : VOPCX_I64 , "v_cmpx_f_u64">; -defm V_CMPX_LT_U64 : VOPCX_I64 , "v_cmpx_lt_u64", "v_cmpx_gt_u64">; -defm V_CMPX_EQ_U64 : VOPCX_I64 , "v_cmpx_eq_u64">; -defm V_CMPX_LE_U64 : VOPCX_I64 , "v_cmpx_le_u64", "v_cmpx_ge_u64">; -defm V_CMPX_GT_U64 : VOPCX_I64 , "v_cmpx_gt_u64">; -defm V_CMPX_NE_U64 : VOPCX_I64 , "v_cmpx_ne_u64">; -defm V_CMPX_GE_U64 : VOPCX_I64 , "v_cmpx_ge_u64">; -defm V_CMPX_T_U64 : VOPCX_I64 , "v_cmpx_t_u64">; - -} // End isCompare = 1, isCommutable = 1 - -defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 , "v_cmp_class_f32">; -defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 , "v_cmpx_class_f32">; -defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 , "v_cmp_class_f64">; -defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 , "v_cmpx_class_f64">; - -//===----------------------------------------------------------------------===// -// DS Instructions -//===----------------------------------------------------------------------===// - -defm DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VGPR_32>; -defm DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VGPR_32>; -defm DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VGPR_32>; -defm DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VGPR_32>; -defm DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VGPR_32>; -defm DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VGPR_32>; -defm DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VGPR_32>; -defm DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VGPR_32>; -defm DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VGPR_32>; -defm DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VGPR_32>; -defm DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VGPR_32>; -defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; -defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; -let mayLoad = 0 in { -defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; -defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : 
DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; -} -defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; -defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; -defm DS_MIN_F32 : DS_1A2D_NORET <0x12, "ds_min_f32", VGPR_32>; -defm DS_MAX_F32 : DS_1A2D_NORET <0x13, "ds_max_f32", VGPR_32>; - -defm DS_GWS_INIT : DS_1A_GDS <0x19, "ds_gws_init">; -defm DS_GWS_SEMA_V : DS_1A_GDS <0x1a, "ds_gws_sema_v">; -defm DS_GWS_SEMA_BR : DS_1A_GDS <0x1b, "ds_gws_sema_br">; -defm DS_GWS_SEMA_P : DS_1A_GDS <0x1c, "ds_gws_sema_p">; -defm DS_GWS_BARRIER : DS_1A_GDS <0x1d, "ds_gws_barrier">; -let mayLoad = 0 in { -defm DS_WRITE_B8 : DS_1A1D_NORET <0x1e, "ds_write_b8", VGPR_32>; -defm DS_WRITE_B16 : DS_1A1D_NORET <0x1f, "ds_write_b16", VGPR_32>; -} -defm DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VGPR_32, "ds_add_u32">; -defm DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; -defm DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; -defm DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; -defm DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; -defm DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VGPR_32, "ds_min_i32">; -defm DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VGPR_32, "ds_max_i32">; -defm DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VGPR_32, "ds_min_u32">; -defm DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VGPR_32, "ds_max_u32">; -defm DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VGPR_32, "ds_and_b32">; -defm DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VGPR_32, "ds_or_b32">; -defm DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; -defm DS_MSKOR_RTN_B32 : DS_1A2D_RET <0x2c, "ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; -defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VGPR_32>; -defm DS_WRXCHG2_RTN_B32 : DS_1A2D_RET < - 0x2e, "ds_wrxchg2_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET < - 0x2f, "ds_wrxchg2st64_rtn_b32", VReg_64, "", VGPR_32 ->; -defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; -defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; -defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI -defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; -let mayStore = 0 in { -defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; -defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; -defm DS_READ2ST64_B32 : DS_1A_Off8_RET <0x38, "ds_read2st64_b32", VReg_64>; -defm DS_READ_I8 : DS_1A_RET <0x39, "ds_read_i8", VGPR_32>; -defm DS_READ_U8 : DS_1A_RET <0x3a, "ds_read_u8", VGPR_32>; -defm DS_READ_I16 : DS_1A_RET <0x3b, "ds_read_i16", VGPR_32>; -defm DS_READ_U16 : DS_1A_RET <0x3c, "ds_read_u16", VGPR_32>; -} -defm DS_CONSUME : DS_0A_RET <0x3d, "ds_consume">; -defm DS_APPEND : DS_0A_RET <0x3e, "ds_append">; -defm DS_ORDERED_COUNT : DS_1A_RET_GDS <0x3f, "ds_ordered_count">; -defm DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>; -defm DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>; -defm DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>; -defm DS_INC_U64 : 
DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>; -defm DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>; -defm DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>; -defm DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>; -defm DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>; -defm DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>; -defm DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>; -defm DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>; -defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; -defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; -let mayLoad = 0 in { -defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; -} -defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; -defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; -defm DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>; -defm DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>; - -defm DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">; -defm DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; -defm DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; -defm DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; -defm DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; -defm DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">; -defm DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">; -defm DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">; -defm DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">; -defm DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">; -defm DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">; -defm DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; -defm DS_MSKOR_RTN_B64 : DS_1A2D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; -defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">; -defm DS_WRXCHG2_RTN_B64 : DS_1A2D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_128, "ds_wrxchg2_b64", VReg_64>; -defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_128, "ds_wrxchg2st64_b64", VReg_64>; -defm DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; -defm DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; -defm DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">; -defm DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">; - -let mayStore = 0 in { -defm DS_READ_B64 : DS_1A_RET <0x76, "ds_read_b64", VReg_64>; -defm DS_READ2_B64 : DS_1A_Off8_RET <0x77, "ds_read2_b64", VReg_128>; -defm DS_READ2ST64_B64 : DS_1A_Off8_RET <0x78, "ds_read2st64_b64", VReg_128>; -} - -defm DS_ADD_SRC2_U32 : DS_1A <0x80, "ds_add_src2_u32">; -defm DS_SUB_SRC2_U32 : DS_1A <0x81, "ds_sub_src2_u32">; -defm DS_RSUB_SRC2_U32 : DS_1A <0x82, "ds_rsub_src2_u32">; -defm DS_INC_SRC2_U32 : DS_1A <0x83, "ds_inc_src2_u32">; -defm DS_DEC_SRC2_U32 : DS_1A <0x84, "ds_dec_src2_u32">; -defm DS_MIN_SRC2_I32 : DS_1A <0x85, "ds_min_src2_i32">; -defm DS_MAX_SRC2_I32 : DS_1A <0x86, "ds_max_src2_i32">; -defm 
DS_MIN_SRC2_U32 : DS_1A <0x87, "ds_min_src2_u32">; -defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; -defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; -defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; -defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; -defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; - -defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; -defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; - -defm DS_ADD_SRC2_U64 : DS_1A <0xc0, "ds_add_src2_u64">; -defm DS_SUB_SRC2_U64 : DS_1A <0xc1, "ds_sub_src2_u64">; -defm DS_RSUB_SRC2_U64 : DS_1A <0xc2, "ds_rsub_src2_u64">; -defm DS_INC_SRC2_U64 : DS_1A <0xc3, "ds_inc_src2_u64">; -defm DS_DEC_SRC2_U64 : DS_1A <0xc4, "ds_dec_src2_u64">; -defm DS_MIN_SRC2_I64 : DS_1A <0xc5, "ds_min_src2_i64">; -defm DS_MAX_SRC2_I64 : DS_1A <0xc6, "ds_max_src2_i64">; -defm DS_MIN_SRC2_U64 : DS_1A <0xc7, "ds_min_src2_u64">; -defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; -defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; -defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; -defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; -defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; - -defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; -defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; - -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - -//===----------------------------------------------------------------------===// -// MUBUF Instructions -//===----------------------------------------------------------------------===// - -defm BUFFER_LOAD_FORMAT_X : MUBUF_Load_Helper < - mubuf<0x00>, "buffer_load_format_x", VGPR_32 ->; -defm BUFFER_LOAD_FORMAT_XY : MUBUF_Load_Helper < - mubuf<0x01>, "buffer_load_format_xy", VReg_64 ->; -defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < - mubuf<0x02>, "buffer_load_format_xyz", VReg_96 ->; -defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < - mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 ->; -defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < - mubuf<0x04>, "buffer_store_format_x", VGPR_32 ->; -defm BUFFER_STORE_FORMAT_XY : MUBUF_Store_Helper < - mubuf<0x05>, "buffer_store_format_xy", VReg_64 ->; -defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Store_Helper < - mubuf<0x06>, "buffer_store_format_xyz", VReg_96 ->; -defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < - mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 ->; -defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global ->; -defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global ->; -defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global ->; -defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global ->; -defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load ->; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load ->; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load ->; - -defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < - mubuf<0x18>, "buffer_store_byte", VGPR_32, i32, truncstorei8_global ->; - -defm BUFFER_STORE_SHORT : MUBUF_Store_Helper < - mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, 
truncstorei16_global ->; - -defm BUFFER_STORE_DWORD : MUBUF_Store_Helper < - mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store ->; - -defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < - mubuf<0x1d>, "buffer_store_dwordx2", VReg_64, v2i32, global_store ->; - -defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < - mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store ->; - -defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < - mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global ->; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>; -defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < - mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global ->; -defm BUFFER_ATOMIC_SUB : MUBUF_Atomic < - mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global ->; -//def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI -defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic < - mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global ->; -defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic < - mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global ->; -defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic < - mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global ->; -defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic < - mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global ->; -defm BUFFER_ATOMIC_AND : MUBUF_Atomic < - mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global ->; -defm BUFFER_ATOMIC_OR : MUBUF_Atomic < - mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global ->; -defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < - mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global ->; -//def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ , "buffer_atomic_fcmpswap", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 , "buffer_atomic_smin_x2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 , "buffer_atomic_umin_x2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 , "buffer_atomic_smax_x2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 , "buffer_atomic_umax_x2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 , "buffer_atomic_and_x2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 , "buffer_atomic_or_x2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 , "buffer_atomic_xor_x2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 , "buffer_atomic_inc_x2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 , "buffer_atomic_dec_x2", []>; -//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_fcmpswap_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 , "buffer_atomic_fmin_x2", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 , "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 , "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 , 
"buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 , "buffer_wbinvl1", []>; - -//===----------------------------------------------------------------------===// -// MTBUF Instructions -//===----------------------------------------------------------------------===// - -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>; - -//===----------------------------------------------------------------------===// -// MIMG Instructions -//===----------------------------------------------------------------------===// - -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; -//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; -defm 
IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, "image_sample_cl_o">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; -defm 
IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; -defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; -defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m , (outs), (ins), "v_nop", [], "v_nop">; -} - -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { -defm V_MOV_B32 : VOP1Inst , "v_mov_b32", VOP_I32_I32>; -} // End isMoveImm = 1 - -let Uses = [EXEC] in { - -// FIXME: Specify SchedRW for READFIRSTLANE_B32 - -def V_READFIRSTLANE_B32 : VOP1 < - 0x00000002, - (outs SReg_32:$vdst), - (ins VGPR_32:$src0), - "v_readfirstlane_b32 $vdst, $src0", - [] ->; - -} - -let SchedRW = [WriteQuarterRate32] in { - -defm V_CVT_I32_F64 : VOP1Inst , "v_cvt_i32_f64", - VOP_I32_F64, fp_to_sint ->; -defm V_CVT_F64_I32 : VOP1Inst , "v_cvt_f64_i32", - VOP_F64_I32, sint_to_fp ->; -defm V_CVT_F32_I32 : VOP1Inst , "v_cvt_f32_i32", - VOP_F32_I32, sint_to_fp ->; -defm V_CVT_F32_U32 : VOP1Inst , "v_cvt_f32_u32", - VOP_F32_I32, uint_to_fp ->; -defm V_CVT_U32_F32 : VOP1Inst , "v_cvt_u32_f32", - VOP_I32_F32, fp_to_uint ->; -defm V_CVT_I32_F32 : VOP1Inst , "v_cvt_i32_f32", - VOP_I32_F32, fp_to_sint ->; -defm V_CVT_F16_F32 : VOP1Inst , "v_cvt_f16_f32", - VOP_I32_F32, fp_to_f16 ->; -defm V_CVT_F32_F16 : VOP1Inst , "v_cvt_f32_f16", - VOP_F32_I32, f16_to_fp ->; -defm V_CVT_RPI_I32_F32 : VOP1Inst , "v_cvt_rpi_i32_f32", - VOP_I32_F32, cvt_rpi_i32_f32>; -defm V_CVT_FLR_I32_F32 : VOP1Inst , "v_cvt_flr_i32_f32", - VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst , "v_cvt_off_f32_i4", VOP_F32_I32>; -defm V_CVT_F32_F64 : VOP1Inst , "v_cvt_f32_f64", - VOP_F32_F64, fround ->; -defm V_CVT_F64_F32 : VOP1Inst , "v_cvt_f64_f32", - VOP_F64_F32, fextend ->; -defm V_CVT_F32_UBYTE0 : VOP1Inst , "v_cvt_f32_ubyte0", - VOP_F32_I32, AMDGPUcvt_f32_ubyte0 ->; -defm V_CVT_F32_UBYTE1 : VOP1Inst , "v_cvt_f32_ubyte1", - VOP_F32_I32, AMDGPUcvt_f32_ubyte1 ->; -defm V_CVT_F32_UBYTE2 : VOP1Inst , "v_cvt_f32_ubyte2", - VOP_F32_I32, AMDGPUcvt_f32_ubyte2 ->; -defm V_CVT_F32_UBYTE3 : VOP1Inst , "v_cvt_f32_ubyte3", - VOP_F32_I32, AMDGPUcvt_f32_ubyte3 ->; -defm V_CVT_U32_F64 : VOP1Inst , 
"v_cvt_u32_f64", - VOP_I32_F64, fp_to_uint ->; -defm V_CVT_F64_U32 : VOP1Inst , "v_cvt_f64_u32", - VOP_F64_I32, uint_to_fp ->; - -} // let SchedRW = [WriteQuarterRate32] - -defm V_FRACT_F32 : VOP1Inst , "v_fract_f32", - VOP_F32_F32, AMDGPUfract ->; -defm V_TRUNC_F32 : VOP1Inst , "v_trunc_f32", - VOP_F32_F32, ftrunc ->; -defm V_CEIL_F32 : VOP1Inst , "v_ceil_f32", - VOP_F32_F32, fceil ->; -defm V_RNDNE_F32 : VOP1Inst , "v_rndne_f32", - VOP_F32_F32, frint ->; -defm V_FLOOR_F32 : VOP1Inst , "v_floor_f32", - VOP_F32_F32, ffloor ->; -defm V_EXP_F32 : VOP1Inst , "v_exp_f32", - VOP_F32_F32, fexp2 ->; - -let SchedRW = [WriteQuarterRate32] in { - -defm V_LOG_F32 : VOP1Inst , "v_log_f32", - VOP_F32_F32, flog2 ->; -defm V_RCP_F32 : VOP1Inst , "v_rcp_f32", - VOP_F32_F32, AMDGPUrcp ->; -defm V_RCP_IFLAG_F32 : VOP1Inst , "v_rcp_iflag_f32", - VOP_F32_F32 ->; -defm V_RSQ_F32 : VOP1Inst , "v_rsq_f32", - VOP_F32_F32, AMDGPUrsq ->; - -} //let SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { - -defm V_RCP_F64 : VOP1Inst , "v_rcp_f64", - VOP_F64_F64, AMDGPUrcp ->; -defm V_RSQ_F64 : VOP1Inst , "v_rsq_f64", - VOP_F64_F64, AMDGPUrsq ->; - -} // let SchedRW = [WriteDouble]; - -defm V_SQRT_F32 : VOP1Inst , "v_sqrt_f32", - VOP_F32_F32, fsqrt ->; - -let SchedRW = [WriteDouble] in { - -defm V_SQRT_F64 : VOP1Inst , "v_sqrt_f64", - VOP_F64_F64, fsqrt ->; - -} // let SchedRW = [WriteDouble] - -defm V_SIN_F32 : VOP1Inst , "v_sin_f32", - VOP_F32_F32, AMDGPUsin ->; -defm V_COS_F32 : VOP1Inst , "v_cos_f32", - VOP_F32_F32, AMDGPUcos ->; -defm V_NOT_B32 : VOP1Inst , "v_not_b32", VOP_I32_I32>; -defm V_BFREV_B32 : VOP1Inst , "v_bfrev_b32", VOP_I32_I32>; -defm V_FFBH_U32 : VOP1Inst , "v_ffbh_u32", VOP_I32_I32>; -defm V_FFBL_B32 : VOP1Inst , "v_ffbl_b32", VOP_I32_I32>; -defm V_FFBH_I32 : VOP1Inst , "v_ffbh_i32", VOP_I32_I32>; -defm V_FREXP_EXP_I32_F64 : VOP1Inst , "v_frexp_exp_i32_f64", - VOP_I32_F64 ->; -defm V_FREXP_MANT_F64 : VOP1Inst , "v_frexp_mant_f64", - VOP_F64_F64 ->; -defm V_FRACT_F64 : VOP1Inst , "v_fract_f64", VOP_F64_F64>; -defm V_FREXP_EXP_I32_F32 : VOP1Inst , "v_frexp_exp_i32_f32", - VOP_I32_F32 ->; -defm V_FREXP_MANT_F32 : VOP1Inst , "v_frexp_mant_f32", - VOP_F32_F32 ->; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m , (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; -} -defm V_MOVRELD_B32 : VOP1Inst , "v_movreld_b32", VOP_I32_I32>; -defm V_MOVRELS_B32 : VOP1Inst , "v_movrels_b32", VOP_I32_I32>; -defm V_MOVRELSD_B32 : VOP1Inst , "v_movrelsd_b32", VOP_I32_I32>; - -// These instruction only exist on SI and CI -let SubtargetPredicate = isSICI in { - -let SchedRW = [WriteQuarterRate32] in { - -defm V_MOV_FED_B32 : VOP1InstSI , "v_mov_fed_b32", VOP_I32_I32>; -defm V_LOG_CLAMP_F32 : VOP1InstSI , "v_log_clamp_f32", VOP_F32_F32>; -defm V_RCP_CLAMP_F32 : VOP1InstSI , "v_rcp_clamp_f32", VOP_F32_F32>; -defm V_RCP_LEGACY_F32 : VOP1InstSI , "v_rcp_legacy_f32", VOP_F32_F32>; -defm V_RSQ_CLAMP_F32 : VOP1InstSI , "v_rsq_clamp_f32", - VOP_F32_F32, AMDGPUrsq_clamped ->; -defm V_RSQ_LEGACY_F32 : VOP1InstSI , "v_rsq_legacy_f32", - VOP_F32_F32, AMDGPUrsq_legacy ->; - -} // End let SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteDouble] in { - -defm V_RCP_CLAMP_F64 : VOP1InstSI , "v_rcp_clamp_f64", VOP_F64_F64>; -defm V_RSQ_CLAMP_F64 : VOP1InstSI , "v_rsq_clamp_f64", - VOP_F64_F64, AMDGPUrsq_clamped ->; - -} // End SchedRW = [WriteDouble] - -} // End SubtargetPredicate = isSICI - -//===----------------------------------------------------------------------===// -// VINTRP Instructions 
-//===----------------------------------------------------------------------===// - -let Uses = [M0] in { - -// FIXME: Specify SchedRW for VINTRP insturctions. - -multiclass V_INTERP_P1_F32_m : VINTRP_m < - 0x00000000, - (outs VGPR_32:$dst), - (ins VGPR_32:$i, i32imm:$attr_chan, i32imm:$attr), - "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_p1 i32:$i, (i32 imm:$attr_chan), - (i32 imm:$attr)))] ->; - -let OtherPredicates = [has32BankLDS] in { - -defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; - -} // End OtherPredicates = [has32BankLDS] - -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { - -defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; - -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" - -let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { - -defm V_INTERP_P2_F32 : VINTRP_m < - 0x00000001, - (outs VGPR_32:$dst), - (ins VGPR_32:$src0, VGPR_32:$j, i32imm:$attr_chan, i32imm:$attr), - "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_p2 f32:$src0, i32:$j, (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; - -} // End DisableEncoding = "$src0", Constraints = "$src0 = $dst" - -defm V_INTERP_MOV_F32 : VINTRP_m < - 0x00000002, - (outs VGPR_32:$dst), - (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr), - "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [m0]", - [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), - (i32 imm:$attr)))]>; - -} // End Uses = [M0] - -//===----------------------------------------------------------------------===// -// VOP2 Instructions -//===----------------------------------------------------------------------===// - -multiclass V_CNDMASK { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; - - defm _e64 : VOP3_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, - name#!cast(VOP_CNDMASK.Asm64), [], name, 3>; -} - -defm V_CNDMASK_B32 : V_CNDMASK, "v_cndmask_b32">; - -let isCommutable = 1 in { -defm V_ADD_F32 : VOP2Inst , "v_add_f32", - VOP_F32_F32_F32, fadd ->; - -defm V_SUB_F32 : VOP2Inst , "v_sub_f32", VOP_F32_F32_F32, fsub>; -defm V_SUBREV_F32 : VOP2Inst , "v_subrev_f32", - VOP_F32_F32_F32, null_frag, "v_sub_f32" ->; -} // End isCommutable = 1 - -let isCommutable = 1 in { - -defm V_MUL_LEGACY_F32 : VOP2Inst , "v_mul_legacy_f32", - VOP_F32_F32_F32, int_AMDGPU_mul ->; - -defm V_MUL_F32 : VOP2Inst , "v_mul_f32", - VOP_F32_F32_F32, fmul ->; - -defm V_MUL_I32_I24 : VOP2Inst , "v_mul_i32_i24", - VOP_I32_I32_I32, AMDGPUmul_i24 ->; - -defm V_MUL_HI_I32_I24 : VOP2Inst , "v_mul_hi_i32_i24", - VOP_I32_I32_I32 ->; - -defm V_MUL_U32_U24 : VOP2Inst , "v_mul_u32_u24", - VOP_I32_I32_I32, AMDGPUmul_u24 ->; - -defm V_MUL_HI_U32_U24 : VOP2Inst , "v_mul_hi_u32_u24", - VOP_I32_I32_I32 ->; - -defm V_MIN_F32 : VOP2Inst , "v_min_f32", VOP_F32_F32_F32, - fminnum>; -defm V_MAX_F32 : VOP2Inst , "v_max_f32", VOP_F32_F32_F32, - fmaxnum>; -defm V_MIN_I32 : VOP2Inst , "v_min_i32", VOP_I32_I32_I32>; -defm V_MAX_I32 : VOP2Inst , "v_max_i32", VOP_I32_I32_I32>; -defm V_MIN_U32 : VOP2Inst , "v_min_u32", VOP_I32_I32_I32>; -defm V_MAX_U32 : VOP2Inst , "v_max_u32", VOP_I32_I32_I32>; - -defm V_LSHRREV_B32 : VOP2Inst < - vop2<0x16, 0x10>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, - "v_lshr_b32" ->; - -defm V_ASHRREV_I32 : VOP2Inst < - vop2<0x18, 0x11>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, - "v_ashr_i32" ->; - -defm V_LSHLREV_B32 : VOP2Inst < - vop2<0x1a, 0x12>, "v_lshlrev_b32", 
VOP_I32_I32_I32, null_frag, - "v_lshl_b32" ->; - -defm V_AND_B32 : VOP2Inst , "v_and_b32", VOP_I32_I32_I32>; -defm V_OR_B32 : VOP2Inst , "v_or_b32", VOP_I32_I32_I32>; -defm V_XOR_B32 : VOP2Inst , "v_xor_b32", VOP_I32_I32_I32>; - -defm V_MAC_F32 : VOP2Inst , "v_mac_f32", VOP_F32_F32_F32>; -} // End isCommutable = 1 - -defm V_MADMK_F32 : VOP2MADK , "v_madmk_f32">; - -let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2MADK , "v_madak_f32">; -} // End isCommutable = 1 - -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC -// No patterns so that the scalar instructions are always selected. -// The scalar versions will be replaced with vector when needed later. - -// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, -// but the VI instructions behave the same as the SI versions. -defm V_ADD_I32 : VOP2bInst , "v_add_i32", - VOP_I32_I32_I32, add ->; -defm V_SUB_I32 : VOP2bInst , "v_sub_i32", VOP_I32_I32_I32>; - -defm V_SUBREV_I32 : VOP2bInst , "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" ->; - -let Uses = [VCC] in { // Carry-in comes from VCC -defm V_ADDC_U32 : VOP2bInst , "v_addc_u32", - VOP_I32_I32_I32_VCC ->; -defm V_SUBB_U32 : VOP2bInst , "v_subb_u32", - VOP_I32_I32_I32_VCC ->; -defm V_SUBBREV_U32 : VOP2bInst , "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" ->; - -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] - -defm V_READLANE_B32 : VOP2SI_3VI_m < - vop3 <0x001, 0x289>, - "v_readlane_b32", - (outs SReg_32:$vdst), - (ins VGPR_32:$src0, SCSrc_32:$src1), - "v_readlane_b32 $vdst, $src0, $src1" ->; - -defm V_WRITELANE_B32 : VOP2SI_3VI_m < - vop3 <0x002, 0x28a>, - "v_writelane_b32", - (outs VGPR_32:$vdst), - (ins SReg_32:$src0, SCSrc_32:$src1), - "v_writelane_b32 $vdst, $src0, $src1" ->; - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -defm V_MIN_LEGACY_F32 : VOP2InstSI , "v_min_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmin_legacy ->; -defm V_MAX_LEGACY_F32 : VOP2InstSI , "v_max_legacy_f32", - VOP_F32_F32_F32, AMDGPUfmax_legacy ->; - -let isCommutable = 1 in { -defm V_LSHR_B32 : VOP2InstSI , "v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2InstSI , "v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2InstSI , "v_lshl_b32", VOP_I32_I32_I32>; -} // End isCommutable = 1 -} // End let SubtargetPredicate = SICI - -let isCommutable = 1 in { -defm V_MAC_LEGACY_F32 : VOP2_VI3_Inst , "v_mac_legacy_f32", - VOP_F32_F32_F32 ->; -} // End isCommutable = 1 - -defm V_BFM_B32 : VOP2_VI3_Inst , "v_bfm_b32", - VOP_I32_I32_I32 ->; -defm V_BCNT_U32_B32 : VOP2_VI3_Inst , "v_bcnt_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 ->; -defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst , "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 ->; -defm V_LDEXP_F32 : VOP2_VI3_Inst , "v_ldexp_f32", - VOP_F32_F32_I32, AMDGPUldexp ->; - -defm V_CVT_PKACCUM_U8_F32 : VOP2_VI3_Inst , "v_cvt_pkaccum_u8_f32", - VOP_I32_F32_I32>; // TODO: set "Uses = dst" - -defm V_CVT_PKNORM_I16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_i16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKNORM_U16_F32 : VOP2_VI3_Inst , "v_cvt_pknorm_u16_f32", - VOP_I32_F32_F32 ->; -defm V_CVT_PKRTZ_F16_F32 : VOP2_VI3_Inst , "v_cvt_pkrtz_f16_f32", - VOP_I32_F32_F32, int_SI_packf16 ->; -defm V_CVT_PK_U16_U32 : VOP2_VI3_Inst , "v_cvt_pk_u16_u32", - VOP_I32_I32_I32 ->; -defm V_CVT_PK_I16_I32 : VOP2_VI3_Inst , "v_cvt_pk_i16_i32", - VOP_I32_I32_I32 ->; - 
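// A minimal usage sketch (illustrative only; standard SI assembly syntax is
// assumed, this is not taken from the file above): the VOP2b carry chain
// splits a 64-bit add into a low half whose carry-out goes to VCC and a high
// half that consumes that carry:
//
//   v_add_i32  v0, vcc, v2, v4        ; low 32 bits, carry-out -> VCC
//   v_addc_u32 v1, vcc, v3, v5, vcc   ; high 32 bits, carry-in from VCC
//
// On VI the low half is written v_add_u32, matching the *_U32 rename noted
// in the comment on the definitions above.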
-//===----------------------------------------------------------------------===// -// VOP3 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { -defm V_MAD_LEGACY_F32 : VOP3Inst , "v_mad_legacy_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_MAD_F32 : VOP3Inst , "v_mad_f32", - VOP_F32_F32_F32_F32, fmad ->; - -defm V_MAD_I32_I24 : VOP3Inst , "v_mad_i32_i24", - VOP_I32_I32_I32_I32, AMDGPUmad_i24 ->; -defm V_MAD_U32_U24 : VOP3Inst , "v_mad_u32_u24", - VOP_I32_I32_I32_I32, AMDGPUmad_u24 ->; -} // End isCommutable = 1 - -defm V_CUBEID_F32 : VOP3Inst , "v_cubeid_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBESC_F32 : VOP3Inst , "v_cubesc_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBETC_F32 : VOP3Inst , "v_cubetc_f32", - VOP_F32_F32_F32_F32 ->; -defm V_CUBEMA_F32 : VOP3Inst , "v_cubema_f32", - VOP_F32_F32_F32_F32 ->; - -defm V_BFE_U32 : VOP3Inst , "v_bfe_u32", - VOP_I32_I32_I32_I32, AMDGPUbfe_u32 ->; -defm V_BFE_I32 : VOP3Inst , "v_bfe_i32", - VOP_I32_I32_I32_I32, AMDGPUbfe_i32 ->; - -defm V_BFI_B32 : VOP3Inst , "v_bfi_b32", - VOP_I32_I32_I32_I32, AMDGPUbfi ->; - -let isCommutable = 1 in { -defm V_FMA_F32 : VOP3Inst , "v_fma_f32", - VOP_F32_F32_F32_F32, fma ->; -defm V_FMA_F64 : VOP3Inst , "v_fma_f64", - VOP_F64_F64_F64_F64, fma ->; -} // End isCommutable = 1 - -//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; -defm V_ALIGNBIT_B32 : VOP3Inst , "v_alignbit_b32", - VOP_I32_I32_I32_I32 ->; -defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", - VOP_I32_I32_I32_I32 ->; - -defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmin3>; - -defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmin3 ->; -defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", - VOP_I32_I32_I32_I32, AMDGPUumin3 ->; -defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", - VOP_F32_F32_F32_F32, AMDGPUfmax3 ->; -defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", - VOP_I32_I32_I32_I32, AMDGPUsmax3 ->; -defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", - VOP_I32_I32_I32_I32, AMDGPUumax3 ->; -defm V_MED3_F32 : VOP3Inst , "v_med3_f32", - VOP_F32_F32_F32_F32 ->; -defm V_MED3_I32 : VOP3Inst , "v_med3_i32", - VOP_I32_I32_I32_I32 ->; -defm V_MED3_U32 : VOP3Inst , "v_med3_u32", - VOP_I32_I32_I32_I32 ->; - -//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; -//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; -//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; -defm V_SAD_U32 : VOP3Inst , "v_sad_u32", - VOP_I32_I32_I32_I32 ->; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; -defm V_DIV_FIXUP_F32 : VOP3Inst < - vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup ->; - -let SchedRW = [WriteDouble] in { - -defm V_DIV_FIXUP_F64 : VOP3Inst < - vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup ->; - -} // let SchedRW = [WriteDouble] - -let SchedRW = [WriteDouble] in { -let isCommutable = 1 in { - -defm V_ADD_F64 : VOP3Inst , "v_add_f64", - VOP_F64_F64_F64, fadd ->; -defm V_MUL_F64 : VOP3Inst , "v_mul_f64", - VOP_F64_F64_F64, fmul ->; - -defm V_MIN_F64 : VOP3Inst , "v_min_f64", - VOP_F64_F64_F64, fminnum ->; -defm V_MAX_F64 : VOP3Inst , "v_max_f64", - VOP_F64_F64_F64, fmaxnum ->; - -} // isCommutable = 1 - -defm V_LDEXP_F64 : VOP3Inst , "v_ldexp_f64", - VOP_F64_F64_I32, AMDGPUldexp ->; - -} // let SchedRW = [WriteDouble] - -let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { - -defm V_MUL_LO_U32 : VOP3Inst , "v_mul_lo_u32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_U32 : VOP3Inst , 
"v_mul_hi_u32", - VOP_I32_I32_I32 ->; - -defm V_MUL_LO_I32 : VOP3Inst , "v_mul_lo_i32", - VOP_I32_I32_I32 ->; -defm V_MUL_HI_I32 : VOP3Inst , "v_mul_hi_i32", - VOP_I32_I32_I32 ->; - -} // isCommutable = 1, SchedRW = [WriteQuarterRate32] - -let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 , "v_div_scale_f32", []>; -} - -let SchedRW = [WriteDouble, WriteSALU] in { -// Double precision division pre-scale. -defm V_DIV_SCALE_F64 : VOP3b_64 , "v_div_scale_f64", []>; -} // let SchedRW = [WriteDouble] - -let isCommutable = 1, Uses = [VCC] in { - -// v_div_fmas_f32: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^32 -// -defm V_DIV_FMAS_F32 : VOP3_VCC_Inst , "v_div_fmas_f32", - VOP_F32_F32_F32_F32, AMDGPUdiv_fmas ->; - -let SchedRW = [WriteDouble] in { -// v_div_fmas_f64: -// result = src0 * src1 + src2 -// if (vcc) -// result *= 2^64 -// -defm V_DIV_FMAS_F64 : VOP3_VCC_Inst , "v_div_fmas_f64", - VOP_F64_F64_F64_F64, AMDGPUdiv_fmas ->; - -} // End SchedRW = [WriteDouble] -} // End isCommutable = 1 - -//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; -//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; -//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>; - -let SchedRW = [WriteDouble] in { -defm V_TRIG_PREOP_F64 : VOP3Inst < - vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop ->; - -} // let SchedRW = [WriteDouble] - -// These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { - -defm V_LSHL_B64 : VOP3Inst , "v_lshl_b64", VOP_I64_I64_I32>; -defm V_LSHR_B64 : VOP3Inst , "v_lshr_b64", VOP_I64_I64_I32>; -defm V_ASHR_I64 : VOP3Inst , "v_ashr_i64", VOP_I64_I64_I32>; - -defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", - VOP_F32_F32_F32_F32>; - -} // End SubtargetPredicate = isSICI - -let SubtargetPredicate = isVI in { - -defm V_LSHLREV_B64 : VOP3Inst , "v_lshlrev_b64", - VOP_I64_I32_I64 ->; -defm V_LSHRREV_B64 : VOP3Inst , "v_lshrrev_b64", - VOP_I64_I32_I64 ->; -defm V_ASHRREV_I64 : VOP3Inst , "v_ashrrev_i64", - VOP_I64_I32_I64 ->; - -} // End SubtargetPredicate = isVI - -//===----------------------------------------------------------------------===// -// Pseudo Instructions -//===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, isPseudo = 1 in { - -// For use in patterns -def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] ->; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { -// 64-bit vector move instruction. This is mainly used by the SIFoldOperands -// pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; -} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 - -let hasSideEffects = 1 in { -def SGPR_USE : InstSI <(outs),(ins), "", []>; -} - -// SI pseudo instructions. These are used by the CFG structurizer pass -// and should be lowered to ISA instructions prior to codegen. 
- -let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { -let Uses = [EXEC], Defs = [EXEC] in { - -let isBranch = 1, isTerminator = 1 in { - -def SI_IF: InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, brtarget:$target), - "", - [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] ->; - -def SI_ELSE : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src, brtarget:$target), - "", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] -> { - let Constraints = "$src = $dst"; -} - -def SI_LOOP : InstSI < - (outs), - (ins SReg_64:$saved, brtarget:$target), - "si_loop $saved, $target", - [(int_SI_loop i64:$saved, bb:$target)] ->; - -} // end isBranch = 1, isTerminator = 1 - -def SI_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src), - "si_else $dst, $src", - [(set i64:$dst, (int_SI_break i64:$src))] ->; - -def SI_IF_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, SReg_64:$src), - "si_if_break $dst, $vcc, $src", - [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] ->; - -def SI_ELSE_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src0, SReg_64:$src1), - "si_else_break $dst, $src0, $src1", - [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] ->; - -def SI_END_CF : InstSI < - (outs), - (ins SReg_64:$saved), - "si_end_cf $saved", - [(int_SI_end_cf i64:$saved)] ->; - -} // End Uses = [EXEC], Defs = [EXEC] - -let Uses = [EXEC], Defs = [EXEC,VCC] in { -def SI_KILL : InstSI < - (outs), - (ins VSrc_32:$src), - "si_kill $src", - [(int_AMDGPU_kill f32:$src)] ->; -} // End Uses = [EXEC], Defs = [EXEC,VCC] - -} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 - -let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { - -//defm SI_ : RegisterLoadStore ; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), - "si_indirect_src $dst, $temp, $src, $idx, $off", - [] ->; - -class SI_INDIRECT_DST : InstSI < - (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), - "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", - [] -> { - let Constraints = "$src = $dst"; -} - -def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST; -def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST; - -} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] - -multiclass SI_SPILL_SGPR { - - let UseNamedOperandTable = 1 in { - def _SAVE : InstSI < - (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - >; - - def _RESTORE : InstSI < - (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - >; - } // End UseNamedOperandTable = 1 -} - -// It's unclear whether you can use M0 as the output of v_readlane_b32 -// instructions, so use SGPR_32 register class for spills to prevent -// this from happening. 
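// Each defm below expands the SI_SPILL_SGPR multiclass into a _SAVE/_RESTORE
// pair per spill width, e.g. SI_SPILL_S64_SAVE and SI_SPILL_S64_RESTORE for
// a 64-bit SGPR pair.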
-defm SI_SPILL_S32 : SI_SPILL_SGPR ; -defm SI_SPILL_S64 : SI_SPILL_SGPR ; -defm SI_SPILL_S128 : SI_SPILL_SGPR ; -defm SI_SPILL_S256 : SI_SPILL_SGPR ; -defm SI_SPILL_S512 : SI_SPILL_SGPR ; - -multiclass SI_SPILL_VGPR { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { - def _SAVE : InstSI < - (outs), - (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - >; - - def _RESTORE : InstSI < - (outs vgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - >; - } // End UseNamedOperandTable = 1, VGPRSpill = 1 -} - -defm SI_SPILL_V32 : SI_SPILL_VGPR ; -defm SI_SPILL_V64 : SI_SPILL_VGPR ; -defm SI_SPILL_V96 : SI_SPILL_VGPR ; -defm SI_SPILL_V128 : SI_SPILL_VGPR ; -defm SI_SPILL_V256 : SI_SPILL_VGPR ; -defm SI_SPILL_V512 : SI_SPILL_VGPR ; - -let Defs = [SCC] in { - -def SI_CONSTDATA_PTR : InstSI < - (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; - -} // End Defs = [SCC] - -} // end IsCodeGenOnly, isPseudo - -} // end SubtargetPredicate = isGCN - -let Predicates = [isGCN] in { - -def : Pat< - (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, - (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, - DSTCLAMP.NONE, DSTOMOD.NONE)) ->; - -def : Pat < - (int_AMDGPU_kilp), - (SI_KILL 0xbf800000) ->; - -/* int_SI_vs_load_input */ -def : Pat< - (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) ->; - -/* int_SI_export */ -def : Pat < - (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, - f32:$src0, f32:$src1, f32:$src2, f32:$src3), - (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, - $src0, $src1, $src2, $src3) ->; - -//===----------------------------------------------------------------------===// -// SMRD Patterns -//===----------------------------------------------------------------------===// - -multiclass SMRD_Pattern { - - // 1. SI-CI: Offset as 8bit DWORD immediate - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -multiclass SMRD_Pattern_vi { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -let Predicates = [isSICI] in { -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -defm : SMRD_Pattern ; -} // End Predicates = [isSICI] - -let Predicates = [isVI] in { -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -defm : SMRD_Pattern_vi ; -} // End Predicates = [isVI] - -let Predicates = [isSICI] in { - -// 1. 
Offset as 8bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) ->; - -} // End Predicates = [isSICI] - -// 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant v4i32:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) ->; - -//===----------------------------------------------------------------------===// -// SOP1 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i64 (ctpop i64:$src)), - (i64 (REG_SEQUENCE SReg_64, - (S_BCNT1_I32_B64 $src), sub0, - (S_MOV_B32 0), sub1)) ->; - -//===----------------------------------------------------------------------===// -// SOP2 Patterns -//===----------------------------------------------------------------------===// - -// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector -// case, the sgpr-copies pass will fix this to use the vector version. -def : Pat < - (i32 (addc i32:$src0, i32:$src1)), - (S_ADD_U32 $src0, $src1) ->; - -//===----------------------------------------------------------------------===// -// SOPP Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (int_AMDGPU_barrier_global), - (S_BARRIER) ->; - -//===----------------------------------------------------------------------===// -// VOP1 Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [UnsafeFPMath] in { - -//def : RcpPat; -//defm : RsqPat; -//defm : RsqPat; - -def : RsqPat; -def : RsqPat; -} - -//===----------------------------------------------------------------------===// -// VOP2 Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), - (V_BCNT_U32_B32_e64 $popcnt, $val) ->; - -def : Pat < - (i32 (select i1:$src0, i32:$src1, i32:$src2)), - (V_CNDMASK_B32_e64 $src2, $src1, $src0) ->; - -/********** ======================= **********/ -/********** Image sampling patterns **********/ -/********** ======================= **********/ - -// Image + sampler -class SampleRawPattern : Pat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc, $sampler) ->; - -multiclass SampleRawPatterns { - def : SampleRawPattern(opcode # _V4_V1), i32>; - def : SampleRawPattern(opcode # _V4_V2), v2i32>; - def : SampleRawPattern(opcode # _V4_V4), v4i32>; - def : SampleRawPattern(opcode # _V4_V8), v8i32>; - def : SampleRawPattern(opcode # _V4_V16), v16i32>; -} - -// Image only -class ImagePattern : Pat < - (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc) ->; - -multiclass ImagePatterns { - def : ImagePattern(opcode # _V4_V1), i32>; - def : ImagePattern(opcode # _V4_V2), v2i32>; - def : ImagePattern(opcode # _V4_V4), v4i32>; -} - -// Basic sample -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; 
-defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with comparison -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with offsets -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Sample with comparison and offsets -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; -defm : SampleRawPatterns; - -// Gather opcodes -// Only the variants which make sense are defined. -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : SampleRawPattern; -def : SampleRawPattern; -def : SampleRawPattern; - -def : ImagePattern; -defm : ImagePatterns; -defm : ImagePatterns; - -/* SIsample for simple 1D texture lookup */ -def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SamplePattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleRectPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleShadowPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -class SampleShadowArrayPattern : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) ->; - -/* SIsample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns { - def : SamplePattern ; - def : SampleRectPattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : 
SampleShadowPattern ; - def : SampleShadowArrayPattern ; - - def : SamplePattern ; - def : SampleArrayPattern ; - def : SampleShadowPattern ; - def : SampleShadowArrayPattern ; -} - -defm : SamplePatterns; -defm : SamplePatterns; -defm : SamplePatterns; -defm : SamplePatterns; - -/* int_SI_imageload for texture fetches consuming varying address parameters */ -class ImageLoadPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayMSAAPattern : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -multiclass ImageLoadPatterns { - def : ImageLoadPattern ; - def : ImageLoadArrayPattern ; -} - -multiclass ImageLoadMSAAPatterns { - def : ImageLoadMSAAPattern ; - def : ImageLoadArrayMSAAPattern ; -} - -defm : ImageLoadPatterns; -defm : ImageLoadPatterns; - -defm : ImageLoadMSAAPatterns; -defm : ImageLoadMSAAPatterns; - -/* Image resource information */ -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -/********** ============================================ **********/ -/********** Extraction, Insertion, Building and Casting **********/ -/********** ============================================ **********/ - -foreach Index = 0-2 in { - def Extract_Element_v2i32_#Index : Extract_Element < - i32, v2i32, Index, !cast(sub#Index) - >; - def Insert_Element_v2i32_#Index : Insert_Element < - i32, v2i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v2f32_#Index : Extract_Element < - f32, v2f32, Index, !cast(sub#Index) - >; - def Insert_Element_v2f32_#Index : Insert_Element < - f32, v2f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-3 in { - def Extract_Element_v4i32_#Index : Extract_Element < - i32, v4i32, Index, !cast(sub#Index) - >; - def Insert_Element_v4i32_#Index : Insert_Element < - i32, v4i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v4f32_#Index : Extract_Element < - f32, v4f32, Index, !cast(sub#Index) - >; - def Insert_Element_v4f32_#Index : Insert_Element < - f32, v4f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-7 in { - def Extract_Element_v8i32_#Index : Extract_Element < - i32, v8i32, Index, !cast(sub#Index) - >; - def Insert_Element_v8i32_#Index : Insert_Element < - i32, v8i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v8f32_#Index : Extract_Element < - f32, v8f32, Index, !cast(sub#Index) - >; - def Insert_Element_v8f32_#Index : Insert_Element < - f32, v8f32, Index, !cast(sub#Index) - >; -} - -foreach Index = 0-15 in { - def Extract_Element_v16i32_#Index : Extract_Element < - i32, v16i32, Index, !cast(sub#Index) - >; - def Insert_Element_v16i32_#Index : Insert_Element < - i32, v16i32, Index, !cast(sub#Index) - >; - - def Extract_Element_v16f32_#Index : Extract_Element < - f32, v16f32, Index, 
!cast(sub#Index) - >; - def Insert_Element_v16f32_#Index : Insert_Element < - f32, v16f32, Index, !cast(sub#Index) - >; -} - -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; - -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; -def : BitConvert ; - -def : BitConvert ; -def : BitConvert ; - -/********** =================== **********/ -/********** Src & Dst modifiers **********/ -/********** =================== **********/ - -def : Pat < - (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), - (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) ->; - -/********** ================================ **********/ -/********** Floating point absolute/negative **********/ -/********** ================================ **********/ - -// Prevent expanding both fneg and fabs. - -// FIXME: Should use S_OR_B32 -def : Pat < - (fneg (fabs f32:$src)), - (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ ->; - -// FIXME: Should use S_OR_B32 -def : Pat < - (fneg (fabs f64:$src)), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), // Set sign bit. - sub1) ->; - -def : Pat < - (fabs f32:$src), - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) ->; - -def : Pat < - (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) ->; - -def : Pat < - (fabs f64:$src), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. - sub1) ->; - -def : Pat < - (fneg f64:$src), - (REG_SEQUENCE VReg_64, - (i32 (EXTRACT_SUBREG f64:$src, sub0)), - sub0, - (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), - sub1) ->; - -/********** ================== **********/ -/********** Immediate Patterns **********/ -/********** ================== **********/ - -def : Pat < - (SGPRImm<(i32 imm)>:$imm), - (S_MOV_B32 imm:$imm) ->; - -def : Pat < - (SGPRImm<(f32 fpimm)>:$imm), - (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) ->; - -def : Pat < - (i32 imm:$imm), - (V_MOV_B32_e32 imm:$imm) ->; - -def : Pat < - (f32 fpimm:$imm), - (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) ->; - -def : Pat < - (i64 InlineImm:$imm), - (S_MOV_B64 InlineImm:$imm) ->; - -// XXX - Should this use a s_cmp to set SCC? 
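// Note on the immediate patterns above (sketch, constants are illustrative):
// constants that can live in an SGPR are materialized with s_mov_b32 /
// s_mov_b64, while the generic fallback uses v_mov_b32, roughly:
//   s_mov_b32 s0, 0x12345678   - uniform value
//   v_mov_b32 v0, 0x12345678   - divergent value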
- -// Set to sign-extended 64-bit value (true = -1, false = 0) -def : Pat < - (i1 imm:$imm), - (S_MOV_B64 (i64 (as_i64imm $imm))) ->; - -def : Pat < - (f64 InlineFPImm:$imm), - (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm:$imm))) ->; - -/********** ================== **********/ -/********** Intrinsic Patterns **********/ -/********** ================== **********/ - -/* llvm.AMDGPU.pow */ -def : POW_Common ; - -def : Pat < - (int_AMDGPU_div f32:$src0, f32:$src1), - (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) ->; - -def : Pat < - (int_AMDGPU_cube v4f32:$src), - (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), - 0 /* clamp */, 0 /* omod */), sub3) ->; - -def : Pat < - (i32 (sext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) ->; - -class Ext32Pat : Pat < - (i32 (ext i1:$src0)), - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) ->; - -def : Ext32Pat ; -def : Ext32Pat ; - -// Offset in an 32Bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) ->; - -// The multiplication scales from [0,1] to the unsigned integer range -def : Pat < - (AMDGPUurecip i32:$src0), - (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, - (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) ->; - -def : Pat < - (int_SI_tid), - (V_MBCNT_HI_U32_B32_e64 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) ->; - -//===----------------------------------------------------------------------===// -// VOP3 Patterns -//===----------------------------------------------------------------------===// - -def : IMad24Pat; -def : UMad24Pat; - -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1) ->; - -defm : BFIPatterns ; -def : ROTRPattern ; - -/********** ======================= **********/ -/********** Load/Store Patterns **********/ -/********** ======================= **********/ - -class DSReadPat : Pat < - (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), - (inst $ptr, (as_i16imm $offset), (i1 0)) ->; - -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; -def : DSReadPat ; - -let AddedComplexity = 100 in { - -def : DSReadPat ; - -} // End AddedComplexity = 100 - -def : Pat < - (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1))), - (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) ->; - -class DSWritePat : Pat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -def : DSWritePat ; -def : DSWritePat ; -def : DSWritePat ; - -let AddedComplexity = 100 in { - -def : DSWritePat ; -} // End AddedComplexity = 100 - -def : Pat < 
- (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, - (i1 0)) ->; - -class DSAtomicRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, $value, (as_i16imm $offset), (i1 0)) ->; - -// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec -// -// We need to use something for the data0, so we set a register to -// -1. For the non-rtn variants, the manual says it does -// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max -// will always do the increment so I'm assuming it's the same. -// -// We also load this -1 with s_mov_b32 / s_mov_b64 even though this -// needs to be a VGPR. The SGPR copy pass will fix this, and it's -// easier since there is no v_mov_b64. -class DSAtomicIncRetPat : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) ->; - - -class DSAtomicCmpXChg : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), - (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) ->; - - -// 32-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; - -// 64-bit atomics. -def : DSAtomicIncRetPat; -def : DSAtomicIncRetPat; - -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; -def : DSAtomicRetPat; - -def : DSAtomicCmpXChg; - - -//===----------------------------------------------------------------------===// -// MUBUF Patterns -//===----------------------------------------------------------------------===// - -multiclass MUBUFLoad_Pattern { - def : Pat < - (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) - >; -} - -let Predicates = [isSICI] in { -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -defm : MUBUFLoad_Pattern ; -} // End Predicates = [isSICI] - -class MUBUFScratchLoadPat : Pat < - (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), - (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; -def : MUBUFScratchLoadPat ; - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword { - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword 
v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : Pat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; -defm : MUBUF_Load_Dword ; - -class MUBUFScratchStorePat : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, - u16imm:$offset)), - (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) ->; - -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; -def : MUBUFScratchStorePat ; - -/* -class MUBUFStore_Pattern : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $offset) ->; - -let Predicates = [isSICI] in { -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -def : MUBUFStore_Pattern ; -} // End Predicates = [isSICI] - -*/ - -//===----------------------------------------------------------------------===// -// MTBUF Patterns -//===----------------------------------------------------------------------===// - -// TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource : Pat< - (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, - i32:$soffset, imm:$inst_offset, imm:$dfmt, - imm:$nfmt, imm:$offen, imm:$idxen, - imm:$glc, imm:$slc, imm:$tfe), - (opcode - $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen), - (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc, - (as_i1imm $slc), (as_i1imm $tfe), $soffset) ->; - -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; - -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst , "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst , "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst , "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst , "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst , "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - -/********** ====================== **********/ -/********** Indirect adressing **********/ -/********** ====================== **********/ - -multiclass SI_INDIRECT_Pattern { - - // 1. Extract with offset - def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) - >; - - // 2. Extract without offset - def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) - >; - - // 3. 
Insert with offset - def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) - >; - - // 4. Insert without offset - def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) - >; -} - -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; - -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; -defm : SI_INDIRECT_Pattern ; - -//===----------------------------------------------------------------------===// -// Conversion Patterns -//===----------------------------------------------------------------------===// - -def : Pat<(i32 (sext_inreg i32:$src, i1)), - (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 - -// Handle sext_inreg in i64 -def : Pat < - (i64 (sext_inreg i64:$src, i1)), - (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 ->; - -def : Pat < - (i64 (sext_inreg i64:$src, i8)), - (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 ->; - -def : Pat < - (i64 (sext_inreg i64:$src, i16)), - (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 ->; - -def : Pat < - (i64 (sext_inreg i64:$src, i32)), - (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 ->; - -class ZExt_i64_i32_Pat : Pat < - (i64 (ext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) ->; - -class ZExt_i64_i1_Pat : Pat < - (i64 (ext i1:$src)), - (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 0), sub1) ->; - - -def : ZExt_i64_i32_Pat; -def : ZExt_i64_i32_Pat; -def : ZExt_i64_i1_Pat; -def : ZExt_i64_i1_Pat; - -def : Pat < - (i64 (sext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, - (S_ASHR_I32 $src, 31), sub1) ->; - -def : Pat < - (i64 (sext i1:$src)), - (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 0, -1, $src), sub0, - (V_CNDMASK_B32_e64 0, -1, $src), sub1) ->; - -// If we need to perform a logical operation on i1 values, we need to -// use vector comparisons since there is only one SCC register. Vector -// comparisions still write to a pair of SGPRs, so treat these as -// 64-bit comparisons. When legalizing SGPR copies, instructions -// resulting in the copies from SCC to these instructions will be -// moved to the VALU. 
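// In other words, an IR-level 'and i1 %a, %b' selects to a 64-bit scalar
// operation on two lane masks (one bit per lane), e.g. roughly:
//   s_and_b64 s[0:1], s[2:3], s[4:5]
// and likewise for the 'or' and 'xor' patterns below.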
-def : Pat < - (i1 (and i1:$src0, i1:$src1)), - (S_AND_B64 $src0, $src1) ->; - -def : Pat < - (i1 (or i1:$src0, i1:$src1)), - (S_OR_B64 $src0, $src1) ->; - -def : Pat < - (i1 (xor i1:$src0, i1:$src1)), - (S_XOR_B64 $src0, $src1) ->; - -def : Pat < - (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) ->; - -def : Pat < - (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) ->; - -def : Pat < - (f64 (sint_to_fp i1:$src)), - (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) ->; - -def : Pat < - (f64 (uint_to_fp i1:$src)), - (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) ->; - -//===----------------------------------------------------------------------===// -// Miscellaneous Patterns -//===----------------------------------------------------------------------===// - -def : Pat < - (i32 (trunc i64:$a)), - (EXTRACT_SUBREG $a, sub0) ->; - -def : Pat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) ->; - -def : Pat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), - (EXTRACT_SUBREG $a, sub0)), 1) ->; - -def : Pat < - (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 0x00ff00ff), - (V_ALIGNBIT_B32 $a, $a, 24), - (V_ALIGNBIT_B32 $a, $a, 8)) ->; - -def : Pat < - (f32 (select i1:$src2, f32:$src1, f32:$src0)), - (V_CNDMASK_B32_e64 $src0, $src1, $src2) ->; - -multiclass BFMPatterns { - def : Pat < - (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), - (BFM $a, $b) - >; - - def : Pat < - (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV 0)) - >; -} - -defm : BFMPatterns ; -// FIXME: defm : BFMPatterns ; - -def : BFEPattern ; - -//===----------------------------------------------------------------------===// -// Fract Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isSI] in { - -// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is -// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient -// way to implement it is using V_FRACT_F64. -// The workaround for the V_FRACT bug is: -// fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) - -// Convert (x + (-floor(x)) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_CNDMASK_B64_PSEUDO - $x, - (V_MIN_F64 - SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), - SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) ->; - -// Convert floor(x) to (x - fract(x)) -def : Pat < - (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), - (V_ADD_F64 - $mods, - $x, - SRCMODS.NEG, - (V_CNDMASK_B64_PSEUDO - $x, - (V_MIN_F64 - SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), - SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), - DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isSI] - -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - -//============================================================================// -// Miscellaneous Optimization Patterns -//============================================================================// - -def : SHA256MaPattern ; - -//============================================================================// -// Assembler aliases -//============================================================================// - -def : MnemonicAlias<"v_add_u32", "v_add_i32">; -def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; -def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; - -} // End isGCN predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td deleted file mode 100644 index 027a0a2f516..00000000000 --- a/lib/Target/R600/SIIntrinsics.td +++ /dev/null @@ -1,199 +0,0 @@ -//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// SI Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - - -let TargetPrefix = "SI", isTarget = 1 in { - - def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; - def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; - def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; - - // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed - def int_SI_tbuffer_store : Intrinsic < - [], - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW - llvm_i32_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - []>; - - // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed - def int_SI_buffer_load_dword : Intrinsic < - [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - [IntrReadArgMem]>; - - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Fully-flexible SAMPLE instruction. - class SampleRaw : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_v4i32_ty, // sampler(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Image instruction without a sampler. 
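  // (It takes the same operand list as SampleRaw above, minus the v4i32
  // sampler descriptor.)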
- class Image : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) - [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) - llvm_i32_ty, // dmask(imm) - llvm_i32_ty, // unorm(imm) - llvm_i32_ty, // r128(imm) - llvm_i32_ty, // da(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty, // tfe(imm) - llvm_i32_ty], // lwe(imm) - [IntrNoMem]>; - - // Basic sample - def int_SI_image_sample : SampleRaw; - def int_SI_image_sample_cl : SampleRaw; - def int_SI_image_sample_d : SampleRaw; - def int_SI_image_sample_d_cl : SampleRaw; - def int_SI_image_sample_l : SampleRaw; - def int_SI_image_sample_b : SampleRaw; - def int_SI_image_sample_b_cl : SampleRaw; - def int_SI_image_sample_lz : SampleRaw; - def int_SI_image_sample_cd : SampleRaw; - def int_SI_image_sample_cd_cl : SampleRaw; - - // Sample with comparison - def int_SI_image_sample_c : SampleRaw; - def int_SI_image_sample_c_cl : SampleRaw; - def int_SI_image_sample_c_d : SampleRaw; - def int_SI_image_sample_c_d_cl : SampleRaw; - def int_SI_image_sample_c_l : SampleRaw; - def int_SI_image_sample_c_b : SampleRaw; - def int_SI_image_sample_c_b_cl : SampleRaw; - def int_SI_image_sample_c_lz : SampleRaw; - def int_SI_image_sample_c_cd : SampleRaw; - def int_SI_image_sample_c_cd_cl : SampleRaw; - - // Sample with offsets - def int_SI_image_sample_o : SampleRaw; - def int_SI_image_sample_cl_o : SampleRaw; - def int_SI_image_sample_d_o : SampleRaw; - def int_SI_image_sample_d_cl_o : SampleRaw; - def int_SI_image_sample_l_o : SampleRaw; - def int_SI_image_sample_b_o : SampleRaw; - def int_SI_image_sample_b_cl_o : SampleRaw; - def int_SI_image_sample_lz_o : SampleRaw; - def int_SI_image_sample_cd_o : SampleRaw; - def int_SI_image_sample_cd_cl_o : SampleRaw; - - // Sample with comparison and offsets - def int_SI_image_sample_c_o : SampleRaw; - def int_SI_image_sample_c_cl_o : SampleRaw; - def int_SI_image_sample_c_d_o : SampleRaw; - def int_SI_image_sample_c_d_cl_o : SampleRaw; - def int_SI_image_sample_c_l_o : SampleRaw; - def int_SI_image_sample_c_b_o : SampleRaw; - def int_SI_image_sample_c_b_cl_o : SampleRaw; - def int_SI_image_sample_c_lz_o : SampleRaw; - def int_SI_image_sample_c_cd_o : SampleRaw; - def int_SI_image_sample_c_cd_cl_o : SampleRaw; - - // Basic gather4 - def int_SI_gather4 : SampleRaw; - def int_SI_gather4_cl : SampleRaw; - def int_SI_gather4_l : SampleRaw; - def int_SI_gather4_b : SampleRaw; - def int_SI_gather4_b_cl : SampleRaw; - def int_SI_gather4_lz : SampleRaw; - - // Gather4 with comparison - def int_SI_gather4_c : SampleRaw; - def int_SI_gather4_c_cl : SampleRaw; - def int_SI_gather4_c_l : SampleRaw; - def int_SI_gather4_c_b : SampleRaw; - def int_SI_gather4_c_b_cl : SampleRaw; - def int_SI_gather4_c_lz : SampleRaw; - - // Gather4 with offsets - def int_SI_gather4_o : SampleRaw; - def int_SI_gather4_cl_o : SampleRaw; - def int_SI_gather4_l_o : SampleRaw; - def int_SI_gather4_b_o : SampleRaw; - def int_SI_gather4_b_cl_o : SampleRaw; - def int_SI_gather4_lz_o : SampleRaw; - - // Gather4 with comparison and offsets - def int_SI_gather4_c_o : SampleRaw; - def int_SI_gather4_c_cl_o : SampleRaw; - def int_SI_gather4_c_l_o : SampleRaw; - def int_SI_gather4_c_b_o : SampleRaw; - def int_SI_gather4_c_b_cl_o : SampleRaw; - def int_SI_gather4_c_lz_o : SampleRaw; - - def int_SI_getlod : SampleRaw; - - // Image instrinsics. - def int_SI_image_load : Image; - def int_SI_image_load_mip : Image; - def int_SI_getresinfo : Image; - - // Deprecated image and sample intrinsics. 
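  // (These older variants take the texture resource as v32i8 rather than the
  // v8i32 used by the raw image/sample intrinsics above.)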
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - - /* Interpolation Intrinsics */ - - def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; - - /* Control flow Intrinsics */ - - def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; - def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; - def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; - def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; - def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; -} diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp deleted file mode 100644 index 9b1d256dc5a..00000000000 --- a/lib/Target/R600/SILoadStoreOptimizer.cpp +++ /dev/null @@ -1,421 +0,0 @@ -//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to fuse DS instructions with close by immediate offsets. -// This will fuse operations such as -// ds_read_b32 v0, v2 offset:16 -// ds_read_b32 v1, v2 offset:32 -// ==> -// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 -// -// -// Future improvements: -// -// - This currently relies on the scheduler to place loads and stores next to -// each other, and then only merges adjacent pairs of instructions. It would -// be good to be more flexible with interleaved instructions, and possibly run -// before scheduling. It currently missing stores of constants because loading -// the constant into the data register is placed between the stores, although -// this is arguably a scheduling problem. -// -// - Live interval recomputing seems inefficient. This currently only matches -// one pair, and recomputes live intervals and moves on to the next pair. It -// would be better to compute a list of all merges that need to occur -// -// - With a list of instructions to process, we can also merge more. If a -// cluster of loads have offsets that are too large to fit in the 8-bit -// offsets, but are close enough to fit in the 8 bits, we can add to the base -// pointer and use the new reduced offsets. 
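//
// Worked example of the existing st64 handling (numbers are illustrative,
// see offsetsCanBeCombined / mergeRead2Pair below): offsets too large for
// the plain 8-bit fields can still be merged when both element offsets are
// multiples of 64, e.g. with 4-byte elements
//   ds_read_b32 v0, v2 offset:1024
//   ds_read_b32 v1, v2 offset:2048
// ==>
//   ds_read2st64_b32 v[0:1], v2, offset0:4 offset1:8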
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-load-store-opt" - -namespace { - -class SILoadStoreOptimizer : public MachineFunctionPass { -private: - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - MachineRegisterInfo *MRI; - LiveIntervals *LIS; - - - static bool offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned EltSize); - - MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize); - - void updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx); - - MachineBasicBlock::iterator mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize); - - MachineBasicBlock::iterator mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize); - -public: - static char ID; - - SILoadStoreOptimizer() - : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr), - LIS(nullptr) {} - - SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) { - initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); - } - - bool optimizeBlock(MachineBasicBlock &MBB); - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Load / Store Optimizer"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addPreserved(); - AU.addPreserved(); - AU.addPreserved(); - AU.addRequired(); - - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) - -char SILoadStoreOptimizer::ID = 0; - -char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; - -FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) { - return new SILoadStoreOptimizer(TM); -} - -bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0, - unsigned Offset1, - unsigned Size) { - // XXX - Would the same offset be OK? Is there any reason this would happen or - // be useful? - if (Offset0 == Offset1) - return false; - - // This won't be valid if the offset isn't aligned. - if ((Offset0 % Size != 0) || (Offset1 % Size != 0)) - return false; - - unsigned EltOffset0 = Offset0 / Size; - unsigned EltOffset1 = Offset1 / Size; - - // Check if the new offsets fit in the reduced 8-bit range. - if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) - return true; - - // If the offset in elements doesn't fit in 8-bits, we might be able to use - // the stride 64 versions. 
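  // (The st64 encodings scale their 8-bit offsets by 64 elements, which is
  // why dividing by 64 below is safe when both offsets are multiples of 64.)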
- if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0) - return false; - - return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64); -} - -MachineBasicBlock::iterator -SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, - unsigned EltSize){ - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; - ++MBBI; - - if (MBBI->getOpcode() != I->getOpcode()) - return E; - - // Don't merge volatiles. - if (MBBI->hasOrderedMemoryRef()) - return E; - - int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = I->getOperand(AddrIdx); - const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); - - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. - if (AddrReg0.getReg() == AddrReg1.getReg() && - AddrReg0.getSubReg() == AddrReg1.getSubReg()) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), - AMDGPU::OpName::offset); - unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff; - unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; - - // Check both offsets fit in the reduced range. - if (offsetsCanBeCombined(Offset0, Offset1, EltSize)) - return MBBI; - } - - return E; -} - -void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx) { - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), - E = MRI->reg_end(); I != E; ) { - MachineOperand &O = *I; - ++I; - O.substVirtReg(DstReg, SubIdx, *TRI); - } -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize) { - MachineBasicBlock *MBB = I->getParent(); - - // Be careful, since the addresses could be subregisters themselves in weird - // cases, like vectors of pointers. - const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - - unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg(); - unsigned DestReg1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg(); - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - } - - assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); - - const MCInstrDesc &Read2Desc = TII->get(Opc); - - const TargetRegisterClass *SuperRC - = (EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; - unsigned DestReg = MRI->createVirtualRegister(SuperRC); - - DebugLoc DL = I->getDebugLoc(); - MachineInstrBuilder Read2 - = BuildMI(*MBB, I, DL, Read2Desc, DestReg) - .addOperand(*AddrReg) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); - - unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; - unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; - updateRegDefsUses(DestReg0, DestReg, SubRegIdx0); - updateRegDefsUses(DestReg1, DestReg, SubRegIdx1); - - LIS->RemoveMachineInstrFromMaps(I); - // Replacing Paired in the maps with Read2 allows us to avoid updating the - // live range for the m0 register. - LIS->ReplaceMachineInstrInMaps(Paired, Read2); - I->eraseFromParent(); - Paired->eraseFromParent(); - - LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg()); - LIS->shrinkToUses(&AddrRegLI); - - LIS->getInterval(DestReg); // Create new LI - - DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Read2.getInstr(); -} - -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - unsigned EltSize) { - MachineBasicBlock *MBB = I->getParent(); - - // Be sure to use .addOperand(), and not .addReg() with these. We want to be - // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0); - - - unsigned Offset0 - = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff; - unsigned Offset1 - = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff; - - unsigned NewOffset0 = Offset0 / EltSize; - unsigned NewOffset1 = Offset1 / EltSize; - unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - - // Prefer the st64 form if we can use it, even if we can fit the offset in the - // non st64 version. I'm not sure if there's any real reason to do this. - bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0); - if (UseST64) { - NewOffset0 /= 64; - NewOffset1 /= 64; - Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; - } - - assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); - - const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = I->getDebugLoc(); - - // repairLiveintervalsInRange() doesn't handle physical register, so we have - // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); - LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); - LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); - bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); - - MachineInstrBuilder Write2 - = BuildMI(*MBB, I, DL, Write2Desc) - .addOperand(*Addr) // addr - .addOperand(*Data0) // data0 - .addOperand(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .addMemOperand(*I->memoperands_begin()) - .addMemOperand(*Paired->memoperands_begin()); - - // XXX - How do we express subregisters here? 
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; - - LIS->RemoveMachineInstrFromMaps(I); - LIS->RemoveMachineInstrFromMaps(Paired); - I->eraseFromParent(); - Paired->eraseFromParent(); - - // This doesn't handle physical registers like M0 - LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); - - if (UpdateM0Range) { - SlotIndex Write2Index = LIS->getInstructionIndex(Write2); - M0Segment->end = Write2Index.getRegSlot(); - } - - DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Write2.getInstr(); -} - -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - - // Don't combine if volatile. - if (MI.hasOrderedMemoryRef()) { - ++I; - continue; - } - - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { - unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); - if (Match != E) { - Modified = true; - I = mergeRead2Pair(I, Match, Size); - } else { - ++I; - } - - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { - unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size); - if (Match != E) { - Modified = true; - I = mergeWrite2Pair(I, Match, Size); - } else { - ++I; - } - - continue; - } - - ++I; - } - - return Modified; -} - -bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - const TargetSubtargetInfo &STM = MF.getSubtarget(); - TRI = static_cast(STM.getRegisterInfo()); - TII = static_cast(STM.getInstrInfo()); - MRI = &MF.getRegInfo(); - - LIS = &getAnalysis(); - - DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); - - assert(!MRI->isSSA()); - - bool Modified = false; - - for (MachineBasicBlock &MBB : MF) - Modified |= optimizeBlock(MBB); - - return Modified; -} diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp deleted file mode 100644 index c319b32111f..00000000000 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ /dev/null @@ -1,605 +0,0 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass lowers the pseudo control flow instructions to real -/// machine instructions. -/// -/// All control flow is handled using predicated instructions and -/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector -/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs -/// by writting to the 64-bit EXEC register (each bit corresponds to a -/// single vector ALU). Typically, for predicates, a vector ALU will write -/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each -/// Vector ALU) and then the ScalarALU will AND the VCC register with the -/// EXEC to update the predicates. 
-/// -/// For example: -/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -/// %SGPR0 = SI_IF %VCC -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -/// %SGPR0 = SI_ELSE %SGPR0 -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -/// SI_END_CF %SGPR0 -/// -/// becomes: -/// -/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask -/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_CBRANCH_EXECZ label0 // This instruction is an optional -/// // optimization which allows us to -/// // branch if all the bits of -/// // EXEC are zero. -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch -/// -/// label0: -/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block -/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_BRANCH_EXECZ label1 // Use our branch optimization -/// // instruction again. -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block -/// label1: -/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" - -using namespace llvm; - -namespace { - -class SILowerControlFlowPass : public MachineFunctionPass { - -private: - static const unsigned SkipThreshold = 12; - - static char ID; - const SIRegisterInfo *TRI; - const SIInstrInfo *TII; - - bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); - - void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); - - void If(MachineInstr &MI); - void Else(MachineInstr &MI); - void Break(MachineInstr &MI); - void IfBreak(MachineInstr &MI); - void ElseBreak(MachineInstr &MI); - void Loop(MachineInstr &MI); - void EndCf(MachineInstr &MI); - - void Kill(MachineInstr &MI); - void Branch(MachineInstr &MI); - - void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); - void IndirectSrc(MachineInstr &MI); - void IndirectDst(MachineInstr &MI); - -public: - SILowerControlFlowPass(TargetMachine &tm) : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Lower control flow instructions"; - } - -}; - -} // End anonymous namespace - -char SILowerControlFlowPass::ID = 0; - -FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { - return new SILowerControlFlowPass(tm); -} - -bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, - MachineBasicBlock *To) { - - unsigned NumInstr = 0; - - for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); - MBB = *MBB->succ_begin()) { - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - NumInstr < SkipThreshold && I != E; ++I) { - - if (I->isBundle() || !I->isBundled()) - if (++NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { - - if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) - return; - - DebugLoc DL = From.getDebugLoc(); - BuildMI(*From.getParent(), 
&From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); -} - -void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - if (MBB.getParent()->getInfo()->getShaderType() != - ShaderType::PIXEL || - !shouldSkip(&MBB, &MBB.getParent()->back())) - return; - - MachineBasicBlock::iterator Insert = &MI; - ++Insert; - - // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); - - // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); -} - -void SILowerControlFlowPass::If(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) - .addReg(Vcc); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - Skip(MI, MI.getOperand(2)); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Else(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) - .addReg(Src); // Saved EXEC - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Dst); - - Skip(MI, MI.getOperand(2)); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Break(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src = MI.getOperand(1).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vcc = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Vcc) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Saved = MI.getOperand(1).getReg(); - unsigned Src = MI.getOperand(2).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(Saved) - .addReg(Src); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Loop(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Src = MI.getOperand(0).getReg(); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Src); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); - - MI.eraseFromParent(); -} - -void 
SILowerControlFlowPass::EndCf(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - unsigned Reg = MI.getOperand(0).getReg(); - - BuildMI(MBB, MBB.getFirstNonPHI(), DL, - TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::Branch(MachineInstr &MI) { - if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) - MI.eraseFromParent(); - - // If these aren't equal, this is probably an infinite loop. -} - -void SILowerControlFlowPass::Kill(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Op = MI.getOperand(0); - -#ifndef NDEBUG - const SIMachineFunctionInfo *MFI - = MBB.getParent()->getInfo(); - // Kill is only allowed in pixel / geometry shaders. - assert(MFI->getShaderType() == ShaderType::PIXEL || - MFI->getShaderType() == ShaderType::GEOMETRY); -#endif - - // Clear this thread from the exec mask if the operand is negative - if ((Op.isImm())) { - // Constant operand: Set exec mask to 0 or do nothing - if (Op.getImm() & 0x80000000) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addImm(0); - } - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) - .addImm(0) - .addOperand(Op); - } - - MI.eraseFromParent(); -} - -void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I = MI; - - unsigned Save = MI.getOperand(1).getReg(); - unsigned Idx = MI.getOperand(3).getReg(); - - if (AMDGPU::SReg_32RegClass.contains(Idx)) { - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx) - .addImm(Offset); - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx); - } - MBB.insert(I, MovRel); - } else { - - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VGPR_32RegClass.contains(Idx)); - - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); - - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); - - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); - - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); - - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); - - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - // Do the actual move - MBB.insert(I, MovRel); - - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); - - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); - - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); - - } - MI.eraseFromParent(); -} - -/// \param @VecReg The register which holds 
element zero of the vector -/// being addressed into. -/// \param[out] @Reg The base register to use in the indirect addressing instruction. -/// \param[in,out] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] -// As an output, this is a constant value that needs -// to be added to the value stored in M0. -void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, - unsigned &Reg, - int &Offset) { - unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); - if (!SubReg) - SubReg = VecReg; - - const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; - - if (RegIdx < 0) { - Offset = RegIdx; - RegIdx = 0; - } else { - Offset = 0; - } - - Reg = RC->getRegister(RegIdx); -} - -void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vec = MI.getOperand(2).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Reg; - - computeIndirectRegAndOffset(Vec, Reg, Off); - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) - .addReg(Vec, RegState::Implicit); - - LoadM0(MI, MovRel, Off); -} - -void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - - unsigned Dst = MI.getOperand(0).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Val = MI.getOperand(5).getReg(); - unsigned Reg; - - computeIndirectRegAndOffset(Dst, Reg, Off); - - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(Reg, RegState::Define) - .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) - .addReg(Dst, RegState::Implicit); - - LoadM0(MI, MovRel, Off); -} - -bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast(MF.getSubtarget().getRegisterInfo()); - SIMachineFunctionInfo *MFI = MF.getInfo(); - - bool HaveKill = false; - bool NeedWQM = false; - bool NeedFlat = false; - unsigned Depth = 0; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - - MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) - NeedWQM = true; - - // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) - NeedFlat = true; - - switch (MI.getOpcode()) { - default: break; - case AMDGPU::SI_IF: - ++Depth; - If(MI); - break; - - case AMDGPU::SI_ELSE: - Else(MI); - break; - - case AMDGPU::SI_BREAK: - Break(MI); - break; - - case AMDGPU::SI_IF_BREAK: - IfBreak(MI); - break; - - case AMDGPU::SI_ELSE_BREAK: - ElseBreak(MI); - break; - - case AMDGPU::SI_LOOP: - ++Depth; - Loop(MI); - break; - - case AMDGPU::SI_END_CF: - if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); - HaveKill = false; - } - EndCf(MI); - break; - - case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else - HaveKill = true; - Kill(MI); - break; - - case AMDGPU::S_BRANCH: - Branch(MI); - break; - - case AMDGPU::SI_INDIRECT_SRC: - IndirectSrc(MI); - break; - - case AMDGPU::SI_INDIRECT_DST_V1: - case AMDGPU::SI_INDIRECT_DST_V2: - case AMDGPU::SI_INDIRECT_DST_V4: - case AMDGPU::SI_INDIRECT_DST_V8: - case AMDGPU::SI_INDIRECT_DST_V16: - IndirectDst(MI); - break; - } - } - } - - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { - MachineBasicBlock &MBB = MF.front(); - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC).addReg(AMDGPU::EXEC); - } - - // FIXME: This seems inappropriate to do here. - if (NeedFlat && MFI->IsKernel) { - // Insert the prologue initializing the SGPRs pointing to the scratch space - // for flat accesses. - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - - // TODO: What to use with function calls? - - // FIXME: This is reporting stack size that is used in a scratch buffer - // rather than registers as well. - uint64_t StackSizeBytes = FrameInfo->getStackSize(); - - int IndirectBegin - = static_cast(TII)->getIndirectIndexBegin(MF); - // Convert register index to 256-byte unit. - uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); - - assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && - "Stack limits should be smaller than 16-bits"); - - // Initialize the flat scratch register pair. - // TODO: Can we use one s_mov_b64 here? - - // Offset is in units of 256-bytes. - MachineBasicBlock &MBB = MF.front(); - DebugLoc NoDL; - MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); - const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); - - assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); - - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) - .addImm(StackOffset); - - // Documentation says size is "per-thread scratch size in bytes" - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) - .addImm(StackSizeBytes); - } - - return true; -} diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp deleted file mode 100644 index 67421e231d8..00000000000 --- a/lib/Target/R600/SILowerI1Copies.cpp +++ /dev/null @@ -1,151 +0,0 @@ -//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// i1 values are usually inserted by the CFG Structurize pass and they are -/// unique in that they can be copied from VALU to SALU registers. -/// This is not possible for any other value type. Since there are no -/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. 
-/// -//===----------------------------------------------------------------------===// -// - -#define DEBUG_TYPE "si-i1-copies" -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -namespace { - -class SILowerI1Copies : public MachineFunctionPass { -public: - static char ID; - -public: - SILowerI1Copies() : MachineFunctionPass(ID) { - initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Lower i1 Copies"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) - -char SILowerI1Copies::ID = 0; - -char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID; - -FunctionPass *llvm::createSILowerI1CopiesPass() { - return new SILowerI1Copies(); -} - -bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - std::vector I1Defs; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = MRI.getRegClass(Reg); - if (RC == &AMDGPU::VReg_1RegClass) - MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); - continue; - } - - if (MI.getOpcode() != AMDGPU::COPY) - continue; - - const MachineOperand &Dst = MI.getOperand(0); - const MachineOperand &Src = MI.getOperand(1); - - if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) - continue; - - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); - - if (DstRC == &AMDGPU::VReg_1RegClass && - TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { - I1Defs.push_back(Dst.getReg()); - DebugLoc DL = MI.getDebugLoc(); - - MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); - if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { - if (DefInst->getOperand(1).isImm()) { - I1Defs.push_back(Dst.getReg()); - - int64_t Val = DefInst->getOperand(1).getImm(); - assert(Val == 0 || Val == -1); - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) - .addOperand(Dst) - .addImm(Val); - MI.eraseFromParent(); - continue; - } - } - - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .addOperand(Dst) - .addImm(0) - .addImm(-1) - .addOperand(Src); - MI.eraseFromParent(); - } else if 
(TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && - SrcRC == &AMDGPU::VReg_1RegClass) { - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64)) - .addOperand(Dst) - .addOperand(Src) - .addImm(0); - MI.eraseFromParent(); - } - } - } - - for (unsigned Reg : I1Defs) - MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); - - return false; -} diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp deleted file mode 100644 index 587ea63d679..00000000000 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ /dev/null @@ -1,77 +0,0 @@ -//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//===----------------------------------------------------------------------===// - - -#include "SIMachineFunctionInfo.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -#define MAX_LANES 64 - -using namespace llvm; - - -// Pin the vtable to this file. -void SIMachineFunctionInfo::anchor() {} - -SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) - : AMDGPUMachineFunction(MF), - TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), - PSInputAddr(0), - NumUserSGPRs(0), - LDSWaveSpillSize(0) { } - -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( - MachineFunction *MF, - unsigned FrameIndex, - unsigned SubIdx) { - const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast( - MF->getSubtarget().getRegisterInfo()); - MachineRegisterInfo &MRI = MF->getRegInfo(); - int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); - Offset += SubIdx * 4; - - unsigned LaneVGPRIdx = Offset / (64 * 4); - unsigned Lane = (Offset / 4) % 64; - - struct SpilledReg Spill; - - if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); - LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); - - // Add this register as live-in to all blocks to avoid machine verifer - // complaining about use of an undefined physical register. - for (MachineFunction::iterator BI = MF->begin(), BE = MF->end(); - BI != BE; ++BI) { - BI->addLiveIn(LaneVGPR); - } - } - - Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - Spill.Lane = Lane; - return Spill; -} - -unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( - const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget(); - // FIXME: We should get this information from kernel attributes if it - // is available. - return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize(); -} diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h deleted file mode 100644 index 667da4c8af6..00000000000 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ /dev/null @@ -1,66 +0,0 @@ -//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H - -#include "AMDGPUMachineFunction.h" -#include "SIRegisterInfo.h" -#include - -namespace llvm { - -class MachineRegisterInfo; - -/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which -/// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo : public AMDGPUMachineFunction { - void anchor() override; - - unsigned TIDReg; - bool HasSpilledVGPRs; - -public: - - struct SpilledReg { - unsigned VGPR; - int Lane; - SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(0), Lane(-1) { } - bool hasLane() { return Lane != -1;} - }; - - // SIMachineFunctionInfo definition - - SIMachineFunctionInfo(const MachineFunction &MF); - SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, - unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; - bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; - unsigned getTIDReg() const { return TIDReg; }; - void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } - - unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; -}; - -} // End namespace llvm - - -#endif diff --git a/lib/Target/R600/SIPrepareScratchRegs.cpp b/lib/Target/R600/SIPrepareScratchRegs.cpp deleted file mode 100644 index 0a7f684552f..00000000000 --- a/lib/Target/R600/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,194 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. 
-/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. - if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. 
- if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE | - 0xffffffff; // Size - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp deleted file mode 100644 index db2ff0b1f95..00000000000 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ /dev/null @@ -1,543 +0,0 @@ -//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief SI implementation of the TargetRegisterInfo class. 
-// -//===----------------------------------------------------------------------===// - - -#include "SIRegisterInfo.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} - -BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { - BitVector Reserved(getNumRegs()); - Reserved.set(AMDGPU::EXEC); - - // EXEC_LO and EXEC_HI could be allocated and used as regular register, - // but this seems likely to result in bugs, so I'm marking them as reserved. - Reserved.set(AMDGPU::EXEC_LO); - Reserved.set(AMDGPU::EXEC_HI); - - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - Reserved.set(AMDGPU::FLAT_SCR); - Reserved.set(AMDGPU::FLAT_SCR_LO); - Reserved.set(AMDGPU::FLAT_SCR_HI); - - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - Reserved.set(AMDGPU::VGPR255); - Reserved.set(AMDGPU::VGPR254); - - // Tonga and Iceland can only allocate a fixed number of SGPRs due - // to a hw bug. - if (MF.getSubtarget().hasSGPRInitBug()) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - // Assume XNACK_MASK is unused. - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; - - for (unsigned i = Limit; i < NumSGPRs; ++i) { - unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); - MCRegAliasIterator R = MCRegAliasIterator(Reg, this, true); - - for (; R.isValid(); ++R) - Reserved.set(*R); - } - } - - return Reserved; -} - -unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const { - - const AMDGPUSubtarget &STI = MF.getSubtarget(); - // FIXME: We should adjust the max number of waves based on LDS size. 
- unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), - STI.getMaxWavesPerCU()); - unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); - - for (regclass_iterator I = regclass_begin(), E = regclass_end(); - I != E; ++I) { - - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); - unsigned Limit; - - if (isSGPRClass(*I)) { - Limit = SGPRLimit / NumSubRegs; - } else { - Limit = VGPRLimit / NumSubRegs; - } - - const int *Sets = getRegClassPressureSets(*I); - assert(Sets); - for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) - return Limit; - } - } - return 256; -} - -bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo()->hasStackObjects(); -} - -static unsigned getNumSubRegsForSpillOp(unsigned Op) { - - switch (Op) { - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V512_RESTORE: - return 16; - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V256_RESTORE: - return 8; - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V128_RESTORE: - return 4; - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V96_RESTORE: - return 3; - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V64_RESTORE: - return 2; - case AMDGPU::SI_SPILL_S32_SAVE: - case AMDGPU::SI_SPILL_S32_RESTORE: - case AMDGPU::SI_SPILL_V32_SAVE: - case AMDGPU::SI_SPILL_V32_RESTORE: - return 1; - default: llvm_unreachable("Invalid spill opcode"); - } -} - -void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, - unsigned Value, - unsigned ScratchRsrcReg, - unsigned ScratchOffset, - int64_t Offset, - RegScavenger *RS) const { - - MachineBasicBlock *MBB = MI->getParent(); - const MachineFunction *MF = MI->getParent()->getParent(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - LLVMContext &Ctx = MF->getFunction()->getContext(); - DebugLoc DL = MI->getDebugLoc(); - bool IsLoad = TII->get(LoadStoreOp).mayLoad(); - - bool RanOutOfSGPRs = false; - unsigned SOffset = ScratchOffset; - - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - unsigned Size = NumSubRegs * 4; - - if (!isUInt<12>(Offset + Size)) { - SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); - if (SOffset == AMDGPU::NoRegister) { - RanOutOfSGPRs = true; - SOffset = AMDGPU::SGPR0; - } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffset) - .addImm(Offset); - Offset = 0; - } - - if (RanOutOfSGPRs) - Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); - - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { - unsigned SubReg = NumSubRegs > 1 ? 
- getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : - Value; - bool IsKill = (i == e - 1); - - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); - } -} - -void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - MachineFunction *MF = MI->getParent()->getParent(); - MachineBasicBlock *MBB = MI->getParent(); - SIMachineFunctionInfo *MFI = MF->getInfo(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = - static_cast(MF->getSubtarget().getInstrInfo()); - DebugLoc DL = MI->getDebugLoc(); - - MachineOperand &FIOp = MI->getOperand(FIOperandNum); - int Index = MI->getOperand(FIOperandNum).getIndex(); - - switch (MI->getOpcode()) { - // SGPR register spill - case AMDGPU::SI_SPILL_S512_SAVE: - case AMDGPU::SI_SPILL_S256_SAVE: - case AMDGPU::SI_SPILL_S128_SAVE: - case AMDGPU::SI_SPILL_S64_SAVE: - case AMDGPU::SI_SPILL_S32_SAVE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), - Spill.VGPR) - .addReg(SubReg) - .addImm(Spill.Lane); - - } - MI->eraseFromParent(); - break; - } - - // SGPR register restore - case AMDGPU::SI_SPILL_S512_RESTORE: - case AMDGPU::SI_SPILL_S256_RESTORE: - case AMDGPU::SI_SPILL_S128_RESTORE: - case AMDGPU::SI_SPILL_S64_RESTORE: - case AMDGPU::SI_SPILL_S32_RESTORE: { - unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); - - for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), - &AMDGPU::SGPR_32RegClass, i); - struct SIMachineFunctionInfo::SpilledReg Spill = - MFI->getSpilledReg(MF, Index, i); - - if (Spill.VGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - } - - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } - - // TODO: only do this when it is needed - switch (MF->getSubtarget().getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); - break; - case AMDGPUSubtarget::SEA_ISLANDS: - break; - default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. 
- TII->insertNOPs(MI, 4); - } - - MI->eraseFromParent(); - break; - } - - // VGPR register spill - case AMDGPU::SI_SPILL_V512_SAVE: - case AMDGPU::SI_SPILL_V256_SAVE: - case AMDGPU::SI_SPILL_V128_SAVE: - case AMDGPU::SI_SPILL_V96_SAVE: - case AMDGPU::SI_SPILL_V64_SAVE: - case AMDGPU::SI_SPILL_V32_SAVE: - buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); - MI->eraseFromParent(); - break; - case AMDGPU::SI_SPILL_V32_RESTORE: - case AMDGPU::SI_SPILL_V64_RESTORE: - case AMDGPU::SI_SPILL_V96_RESTORE: - case AMDGPU::SI_SPILL_V128_RESTORE: - case AMDGPU::SI_SPILL_V256_RESTORE: - case AMDGPU::SI_SPILL_V512_RESTORE: { - buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), - TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); - MI->eraseFromParent(); - break; - } - - default: { - int64_t Offset = FrameInfo->getObjectOffset(Index); - FIOp.ChangeToImmediate(Offset); - if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); - BuildMI(*MBB, MI, MI->getDebugLoc(), - TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) - .addImm(Offset); - FIOp.ChangeToRegister(TmpReg, false, false, true); - } - } - } -} - -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - -unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { - return getEncodingValue(Reg) & 0xff; -} - -const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { - assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - - static const TargetRegisterClass *BaseClasses[] = { - &AMDGPU::VGPR_32RegClass, - &AMDGPU::SReg_32RegClass, - &AMDGPU::VReg_64RegClass, - &AMDGPU::SReg_64RegClass, - &AMDGPU::VReg_96RegClass, - &AMDGPU::VReg_128RegClass, - &AMDGPU::SReg_128RegClass, - &AMDGPU::VReg_256RegClass, - &AMDGPU::SReg_256RegClass, - &AMDGPU::VReg_512RegClass - }; - - for (const TargetRegisterClass *BaseClass : BaseClasses) { - if (BaseClass->contains(Reg)) { - return BaseClass; - } - } - return nullptr; -} - -bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); -} - -const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const { - if (hasVGPRs(SRC)) { - return SRC; - } else if (SRC == &AMDGPU::SCCRegRegClass) { - return &AMDGPU::VCCRegRegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VGPR_32RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { - return &AMDGPU::VReg_64RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { - return &AMDGPU::VReg_128RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { - return 
&AMDGPU::VReg_256RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { - return &AMDGPU::VReg_512RegClass; - } - return nullptr; -} - -const TargetRegisterClass *SIRegisterInfo::getSubRegClass( - const TargetRegisterClass *RC, unsigned SubIdx) const { - if (SubIdx == AMDGPU::NoSubRegister) - return RC; - - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. - if (isSGPRClass(RC)) { - return &AMDGPU::SGPR_32RegClass; - } else { - return &AMDGPU::VGPR_32RegClass; - } -} - -unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, - const TargetRegisterClass *SubRC, - unsigned Channel) const { - - switch (Reg) { - case AMDGPU::VCC: - switch(Channel) { - case 0: return AMDGPU::VCC_LO; - case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); - } - - case AMDGPU::FLAT_SCR: - switch (Channel) { - case 0: - return AMDGPU::FLAT_SCR_LO; - case 1: - return AMDGPU::FLAT_SCR_HI; - default: - llvm_unreachable("Invalid SubIdx for FLAT_SCR"); - } - break; - - case AMDGPU::EXEC: - switch (Channel) { - case 0: - return AMDGPU::EXEC_LO; - case 1: - return AMDGPU::EXEC_HI; - default: - llvm_unreachable("Invalid SubIdx for EXEC"); - } - break; - } - - const TargetRegisterClass *RC = getPhysRegClass(Reg); - // 32-bit registers don't have sub-registers, so we can just return the - // Reg. We need to have this check here, because the calculation below - // using getHWRegIndex() will fail with special 32-bit registers like - // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0. - if (RC->getSize() == 4) { - assert(Channel == 0); - return Reg; - } - - unsigned Index = getHWRegIndex(Reg); - return SubRC->getRegister(Index + Channel); -} - -bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { - return OpType == AMDGPU::OPERAND_REG_IMM32; -} - -bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { - if (opCanUseLiteralConstant(OpType)) - return true; - - return OpType == AMDGPU::OPERAND_REG_INLINE_C; -} - -unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const { - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: - return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: - return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected preloaded value type"); -} - -/// \brief Returns a register that is not used at any point in the function. -/// If all registers are used, then this function will return -// AMDGPU::NoRegister. 
-unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } - return AMDGPU::NoRegister; -} - -unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { - switch(WaveCount) { - case 10: return 24; - case 9: return 28; - case 8: return 32; - case 7: return 36; - case 6: return 40; - case 5: return 48; - case 4: return 64; - case 3: return 84; - case 2: return 128; - default: return 256; - } -} - -unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const { - if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - switch (WaveCount) { - case 10: return 80; - case 9: return 80; - case 8: return 96; - default: return 102; - } - } else { - switch(WaveCount) { - case 10: return 48; - case 9: return 56; - case 8: return 64; - case 7: return 72; - case 6: return 80; - case 5: return 96; - default: return 103; - } - } -} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h deleted file mode 100644 index bfdb67c5e12..00000000000 --- a/lib/Target/R600/SIRegisterInfo.h +++ /dev/null @@ -1,131 +0,0 @@ -//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface definition for SIRegisterInfo -// -//===----------------------------------------------------------------------===// - - -#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H - -#include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Support/Debug.h" - -namespace llvm { - -struct SIRegisterInfo : public AMDGPURegisterInfo { - - SIRegisterInfo(); - - BitVector getReservedRegs(const MachineFunction &MF) const override; - - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override; - - bool requiresRegisterScavenging(const MachineFunction &Fn) const override; - - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; - - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - - unsigned getHWRegIndex(unsigned Reg) const override; - - /// \brief Return the 'base' register class for this register. - /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. - const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; - - /// \returns true if this class contains only SGPR registers - bool isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) - return false; - - return !hasVGPRs(RC); - } - - /// \returns true if this class ID contains only SGPR registers - bool isSGPRClassID(unsigned RCID) const { - if (static_cast(RCID) == -1) - return false; - - return isSGPRClass(getRegClass(RCID)); - } - - /// \returns true if this class contains VGPR registers. 
- bool hasVGPRs(const TargetRegisterClass *RC) const; - - /// \returns A VGPR reg class with the same width as \p SRC - const TargetRegisterClass *getEquivalentVGPRClass( - const TargetRegisterClass *SRC) const; - - /// \returns The register class that is used for a sub-register of \p RC for - /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will - /// be returned. - const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, - unsigned SubIdx) const; - - /// \p Channel This is the register channel (e.g. a value from 0-16), not the - /// SubReg index. - /// \returns The sub-register of Reg that is in Channel. - unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, - unsigned Channel) const; - - /// \returns True if operands defined with this operand type can accept - /// a literal constant (i.e. any 32-bit immediate). - bool opCanUseLiteralConstant(unsigned OpType) const; - - /// \returns True if operands defined with this operand type can accept - /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. - bool opCanUseInlineConstant(unsigned OpType) const; - - enum PreloadedValue { - TGID_X, - TGID_Y, - TGID_Z, - SCRATCH_WAVE_OFFSET, - SCRATCH_PTR, - INPUT_PTR, - TIDIG_X, - TIDIG_Y, - TIDIG_Z - }; - - /// \brief Returns the physical register that \p Value is stored in. - unsigned getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const; - - /// \brief Give the maximum number of VGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumVGPRsAllowed(unsigned WaveCount) const; - - /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount - /// concurrent waves. - unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const; - - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC) const; - -private: - void buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, unsigned Value, - unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, RegScavenger *RS) const; -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td deleted file mode 100644 index 2a9017fa2a9..00000000000 --- a/lib/Target/R600/SIRegisterInfo.td +++ /dev/null @@ -1,284 +0,0 @@ -//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Declarations that describe the SI registers -//===----------------------------------------------------------------------===// - -class SIReg encoding = 0> : Register { - let Namespace = "AMDGPU"; - let HWEncoding = encoding; -} - -// Special Registers -def VCC_LO : SIReg<"vcc_lo", 106>; -def VCC_HI : SIReg<"vcc_hi", 107>; - -// VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = 106; -} - -def EXEC_LO : SIReg<"exec_lo", 126>; -def EXEC_HI : SIReg<"exec_hi", 127>; - -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = 126; -} - -def SCC : SIReg<"scc", 253>; -def M0 : SIReg <"m0", 124>; - -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. - -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { - let Namespace = "AMDGPU"; - let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; -} - -// SGPR registers -foreach Index = 0-101 in { - def SGPR#Index : SIReg <"SGPR"#Index, Index>; -} - -// VGPR registers -foreach Index = 0-255 in { - def VGPR#Index : SIReg <"VGPR"#Index, Index> { - let HWEncoding{8} = 1; - } -} - -//===----------------------------------------------------------------------===// -// Groupings using register classes and tuples -//===----------------------------------------------------------------------===// - -// SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 101))>; - -// SGPR 64-bit registers -def SGPR_64Regs : RegisterTuples<[sub0, sub1], - [(add (decimate (trunc SGPR_32, 101), 2)), - (add (decimate (shl SGPR_32, 1), 2))]>; - -// SGPR 128-bit registers -def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (decimate (trunc SGPR_32, 99), 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4))]>; - -// SGPR 256-bit registers -def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (decimate (trunc SGPR_32, 95), 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4))]>; - -// SGPR 512-bit registers -def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (decimate (trunc SGPR_32, 87), 4)), - (add (decimate (shl SGPR_32, 1), 4)), - (add (decimate (shl SGPR_32, 2), 4)), - (add (decimate (shl SGPR_32, 3), 4)), - (add (decimate (shl SGPR_32, 4), 4)), - (add (decimate (shl SGPR_32, 5), 4)), - (add (decimate (shl SGPR_32, 6), 4)), - (add (decimate (shl SGPR_32, 7), 4)), - (add (decimate (shl SGPR_32, 8), 4)), - (add (decimate (shl SGPR_32, 9), 4)), - (add (decimate (shl SGPR_32, 10), 4)), - (add (decimate (shl SGPR_32, 11), 4)), - (add (decimate (shl SGPR_32, 12), 4)), - (add (decimate (shl SGPR_32, 13), 4)), - (add 
(decimate (shl SGPR_32, 14), 4)), - (add (decimate (shl SGPR_32, 15), 4))]>; - -// VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "VGPR%u", 0, 255))>; - -// VGPR 64-bit registers -def VGPR_64 : RegisterTuples<[sub0, sub1], - [(add (trunc VGPR_32, 255)), - (add (shl VGPR_32, 1))]>; - -// VGPR 96-bit registers -def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], - [(add (trunc VGPR_32, 254)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2))]>; - -// VGPR 128-bit registers -def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (trunc VGPR_32, 253)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3))]>; - -// VGPR 256-bit registers -def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (trunc VGPR_32, 249)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7))]>; - -// VGPR 512-bit registers -def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (trunc VGPR_32, 241)), - (add (shl VGPR_32, 1)), - (add (shl VGPR_32, 2)), - (add (shl VGPR_32, 3)), - (add (shl VGPR_32, 4)), - (add (shl VGPR_32, 5)), - (add (shl VGPR_32, 6)), - (add (shl VGPR_32, 7)), - (add (shl VGPR_32, 8)), - (add (shl VGPR_32, 9)), - (add (shl VGPR_32, 10)), - (add (shl VGPR_32, 11)), - (add (shl VGPR_32, 12)), - (add (shl VGPR_32, 13)), - (add (shl VGPR_32, 14)), - (add (shl VGPR_32, 15))]>; - -//===----------------------------------------------------------------------===// -// Register classes used as source and destination -//===----------------------------------------------------------------------===// - -class RegImmMatcher : AsmOperandClass { - let Name = name; - let RenderMethod = "addRegOrImmOperands"; -} - -// Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { - let CopyCost = -1; // Theoretically it is possible to read from SCC, - // but it should never be necessary. 
-} - -def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; -def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; - -// Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) ->; - -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; - -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, - (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) ->; - -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; - -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; - -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; - -// Register class for all vector registers (VGPRs + Interpolation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; - -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { - let Size = 96; -} - -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; - -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; - -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; - -def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { - let Size = 32; -} - -class RegImmOperand : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; -} - -class RegInlineOperand : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; -} - -//===----------------------------------------------------------------------===// -// SSrc_* Operands with an SGPR or a 32-bit immediate -//===----------------------------------------------------------------------===// - -def SSrc_32 : RegImmOperand { - let ParserMatchClass = RegImmMatcher<"SSrc32">; -} - -def SSrc_64 : RegImmOperand { - let ParserMatchClass = RegImmMatcher<"SSrc64">; -} - -//===----------------------------------------------------------------------===// -// SCSrc_* Operands with an SGPR or an inline constant -//===----------------------------------------------------------------------===// - -def SCSrc_32 : RegInlineOperand { - let ParserMatchClass = RegImmMatcher<"SCSrc32">; -} - -//===----------------------------------------------------------------------===// -// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate -//===----------------------------------------------------------------------===// - -def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; - -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; - -def VSrc_32 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; - let ParserMatchClass = RegImmMatcher<"VSrc32">; -} - -def VSrc_64 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM32"; - let ParserMatchClass = RegImmMatcher<"VSrc64">; -} - -//===----------------------------------------------------------------------===// -// VCSrc_* Operands with an SGPR, VGPR or an inline constant -//===----------------------------------------------------------------------===// - -def VCSrc_32 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc32">; -} - -def VCSrc_64 : RegisterOperand { - let OperandNamespace = "AMDGPU"; - let OperandType =
"OPERAND_REG_INLINE_C"; - let ParserMatchClass = RegImmMatcher<"VCSrc64">; -} diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td deleted file mode 100644 index 9b1f676020b..00000000000 --- a/lib/Target/R600/SISchedule.td +++ /dev/null @@ -1,91 +0,0 @@ -//===-- SISchedule.td - SI Scheduling definitions -------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// MachineModel definitions for Southern Islands (SI) -// -//===----------------------------------------------------------------------===// - -def WriteBranch : SchedWrite; -def WriteExport : SchedWrite; -def WriteLDS : SchedWrite; -def WriteSALU : SchedWrite; -def WriteSMEM : SchedWrite; -def WriteVMEM : SchedWrite; - -// Vector ALU instructions -def Write32Bit : SchedWrite; -def WriteQuarterRate32 : SchedWrite; - -def WriteFloatFMA : SchedWrite; - -def WriteDouble : SchedWrite; -def WriteDoubleAdd : SchedWrite; - -def SIFullSpeedModel : SchedMachineModel; -def SIQuarterSpeedModel : SchedMachineModel; - -// BufferSize = 0 means the processors are in-order. -let BufferSize = 0 in { - -// XXX: Are the resource counts correct? -def HWBranch : ProcResource<1>; -def HWExport : ProcResource<7>; // Taken from S_WAITCNT -def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT -def HWSALU : ProcResource<1>; -def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT -def HWVALU : ProcResource<1>; - -} - -class HWWriteRes resources, - int latency> : WriteRes { - let Latency = latency; -} - -class HWVALUWriteRes : - HWWriteRes; - - -// The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be accurate. - -// The latency values are 1 / (operations / cycle) / 4. -multiclass SICommonWriteRes { - - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 2 - 64 - def : HWWriteRes; - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 300 - 600 - - def : HWVALUWriteRes; - def : HWVALUWriteRes; -} - - -let SchedModel = SIFullSpeedModel in { - -defm : SICommonWriteRes; - -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; - -} // End SchedModel = SIFullSpeedModel - -let SchedModel = SIQuarterSpeedModel in { - -defm : SICommonWriteRes; - -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; - -} // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp deleted file mode 100644 index 51e72cdb5f9..00000000000 --- a/lib/Target/R600/SIShrinkInstructions.cpp +++ /dev/null @@ -1,272 +0,0 @@ -//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// The pass tries to use the 32-bit encoding for instructions when possible.
-//===----------------------------------------------------------------------===// -// - -#include "AMDGPU.h" -#include "AMDGPUMCInstLower.h" -#include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -#define DEBUG_TYPE "si-shrink-instructions" - -STATISTIC(NumInstructionsShrunk, - "Number of 64-bit instruction reduced to 32-bit."); -STATISTIC(NumLiteralConstantsFolded, - "Number of literal constants folded into 32-bit instructions."); - -namespace llvm { - void initializeSIShrinkInstructionsPass(PassRegistry&); -} - -using namespace llvm; - -namespace { - -class SIShrinkInstructions : public MachineFunctionPass { -public: - static char ID; - -public: - SIShrinkInstructions() : MachineFunctionPass(ID) { - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Shrink Instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, - "SI Lower il Copies", false, false) -INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, - "SI Lower il Copies", false, false) - -char SIShrinkInstructions::ID = 0; - -FunctionPass *llvm::createSIShrinkInstructionsPass() { - return new SIShrinkInstructions(); -} - -static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - if (!MO->isReg()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) - return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); - - return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); -} - -static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, - const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - - const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc. We should handle this the same way we handle vopc, by adding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. - if (Src2) - return false; - - const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - const MachineOperand *Src1Mod = - TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - - if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) - return false; - - // We don't need to check src0, all input types are legal, so just make sure - // src0 isn't using any modifiers. - if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) - return false; - - // Check output modifiers - if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) - return false; - - if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) - return false; - - return true; -} - -/// \brief This function checks \p MI for operands defined by a move immediate -/// instruction and then folds the literal constant into the instruction if it -/// can.
This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction -/// and will only fold literal constants if we are still in SSA. -static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, - MachineRegisterInfo &MRI, bool TryToCommute = true) { - - if (!MRI.isSSA()) - return; - - assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || - TII->isVOPC(MI.getOpcode())); - - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - - // Only one literal constant is allowed per instruction, so if src0 is a - // literal constant then we can't do any folding. - if (Src0.isImm() && - TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) - return; - - // Literal constants and SGPRs can only be used in Src0, so if Src0 is an - // SGPR, we cannot commute the instruction, so we can't fold any literal - // constants. - if (Src0.isReg() && !isVGPR(&Src0, TRI, MRI)) - return; - - // Try to fold Src0 - if (Src0.isReg()) { - unsigned Reg = Src0.getReg(); - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (Def && Def->isMoveImmediate()) { - MachineOperand &MovSrc = Def->getOperand(1); - bool ConstantFolded = false; - - if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) { - Src0.ChangeToImmediate(MovSrc.getImm()); - ConstantFolded = true; - } - if (ConstantFolded) { - if (MRI.use_empty(Reg)) - Def->eraseFromParent(); - ++NumLiteralConstantsFolded; - return; - } - } - } - - // We have failed to fold src0, so commute the instruction and try again. - if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) - foldImmediates(MI, TII, MRI, false); - -} - -bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); - std::vector I1Defs; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; - - // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. - if (MI.getOpcode() == AMDGPU::S_MOV_B32) { - const MachineOperand &Src = MI.getOperand(1); - - if (Src.isImm()) { - if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) - MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - } - - continue; - } - - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - - if (!canShrink(MI, TII, TRI, MRI)) { - // Try commuting the instruction and see if that enables us to shrink - // it. - if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || - !canShrink(MI, TII, TRI, MRI)) - continue; - } - - // getVOPe32 could be -1 here if we started with an instruction that had - // a 32-bit encoding and then commuted it to an instruction that did not. - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); - - if (TII->isVOPC(Op32)) { - unsigned DstReg = MI.getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(DstReg)) { - // VOPC instructions can only write to the VCC register. 
We can't - // force them to use VCC here, because the register allocator has - // trouble with sequences like this, which cause the allocator to run - // out of registers if vreg0 and vreg1 belong to the VCCReg register - // class: - // vreg0 = VOPC; - // vreg1 = VOPC; - // S_AND_B64 vreg0, vreg1 - // - // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we will run - // this pass again after RA and shrink it if it outputs to VCC. - MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); - continue; - } - if (DstReg != AMDGPU::VCC) - continue; - } - - // We can shrink this instruction - DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); - - MachineInstrBuilder Inst32 = - BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - - // dst - Inst32.addOperand(MI.getOperand(0)); - - Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); - - const MachineOperand *Src1 = - TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1) - Inst32.addOperand(*Src1); - - ++NumInstructionsShrunk; - MI.eraseFromParent(); - - foldImmediates(*Inst32, TII, MRI); - DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); - - - } - } - return false; -} diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp deleted file mode 100644 index 591ce857cc7..00000000000 --- a/lib/Target/R600/SITypeRewriter.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass performs the following type substitution on all -/// non-compute shaders: -/// -/// v16i8 => i128 -/// - v16i8 is used for constant memory resource descriptors. This type is -/// legal for some compute APIs, and we don't want to declare it as legal -/// in the backend, because we want the legalizer to expand all v16i8 -/// operations. -/// v1* => * -/// - Having v1* types complicates the legalizer and we can easily replace -/// - them with the element type.
-//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { - -class SITypeRewriter : public FunctionPass, - public InstVisitor { - - static char ID; - Module *Mod; - Type *v16i8; - Type *v4i32; - -public: - SITypeRewriter() : FunctionPass(ID) { } - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - const char *getPassName() const override { - return "SI Type Rewriter"; - } - void visitLoadInst(LoadInst &I); - void visitCallInst(CallInst &I); - void visitBitCast(BitCastInst &I); -}; - -} // End anonymous namespace - -char SITypeRewriter::ID = 0; - -bool SITypeRewriter::doInitialization(Module &M) { - Mod = &M; - v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16); - v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4); - return false; -} - -bool SITypeRewriter::runOnFunction(Function &F) { - Attribute A = F.getFnAttribute("ShaderType"); - - unsigned ShaderType = ShaderType::COMPUTE; - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - Str.getAsInteger(0, ShaderType); - } - if (ShaderType == ShaderType::COMPUTE) - return false; - - visit(F); - visit(F); - - return false; -} - -void SITypeRewriter::visitLoadInst(LoadInst &I) { - Value *Ptr = I.getPointerOperand(); - Type *PtrTy = Ptr->getType(); - Type *ElemTy = PtrTy->getPointerElementType(); - IRBuilder<> Builder(&I); - if (ElemTy == v16i8) { - Value *BitCast = Builder.CreateBitCast(Ptr, - PointerType::get(v4i32,PtrTy->getPointerAddressSpace())); - LoadInst *Load = Builder.CreateLoad(BitCast); - SmallVector, 8> MD; - I.getAllMetadataOtherThanDebugLoc(MD); - for (unsigned i = 0, e = MD.size(); i != e; ++i) { - Load->setMetadata(MD[i].first, MD[i].second); - } - Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType()); - I.replaceAllUsesWith(BitCastLoad); - I.eraseFromParent(); - } -} - -void SITypeRewriter::visitCallInst(CallInst &I) { - IRBuilder<> Builder(&I); - - SmallVector Args; - SmallVector Types; - bool NeedToReplace = false; - Function *F = I.getCalledFunction(); - std::string Name = F->getName(); - for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { - Value *Arg = I.getArgOperand(i); - if (Arg->getType() == v16i8) { - Args.push_back(Builder.CreateBitCast(Arg, v4i32)); - Types.push_back(v4i32); - NeedToReplace = true; - Name = Name + ".v4i32"; - } else if (Arg->getType()->isVectorTy() && - Arg->getType()->getVectorNumElements() == 1 && - Arg->getType()->getVectorElementType() == - Type::getInt32Ty(I.getContext())){ - Type *ElementTy = Arg->getType()->getVectorElementType(); - std::string TypeName = "i32"; - InsertElementInst *Def = cast(Arg); - Args.push_back(Def->getOperand(1)); - Types.push_back(ElementTy); - std::string VecTypeName = "v1" + TypeName; - Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName); - NeedToReplace = true; - } else { - Args.push_back(Arg); - Types.push_back(Arg->getType()); - } - } - - if (!NeedToReplace) { - return; - } - Function *NewF = Mod->getFunction(Name); - if (!NewF) { - NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod); - NewF->setAttributes(F->getAttributes()); - } - I.replaceAllUsesWith(Builder.CreateCall(NewF, Args)); - I.eraseFromParent(); -} - -void SITypeRewriter::visitBitCast(BitCastInst &I) { - IRBuilder<> Builder(&I); - if (I.getDestTy() != v4i32) { - return; - 
} - - if (BitCastInst *Op = dyn_cast(I.getOperand(0))) { - if (Op->getSrcTy() == v4i32) { - I.replaceAllUsesWith(Op->getOperand(0)); - I.eraseFromParent(); - } - } -} - -FunctionPass *llvm::createSITypeRewriter() { - return new SITypeRewriter(); -} diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp deleted file mode 100644 index d723d6e3e8b..00000000000 --- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp +++ /dev/null @@ -1,30 +0,0 @@ -//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// - -#include "AMDGPUTargetMachine.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -/// \brief The target which supports all AMD GPUs. This will eventually -/// be deprecated and there will be an R600 target and a GCN target. -Target llvm::TheAMDGPUTarget; -/// \brief The target for GCN GPUs -Target llvm::TheGCNTarget; - -/// \brief Extern function to initialize the targets for the AMDGPU backend -extern "C" void LLVMInitializeR600TargetInfo() { - RegisterTarget - R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); - RegisterTarget GCN(TheGCNTarget, "amdgcn", "AMD GCN GPUs"); -} diff --git a/lib/Target/R600/TargetInfo/CMakeLists.txt b/lib/Target/R600/TargetInfo/CMakeLists.txt deleted file mode 100644 index c3bd26c7a89..00000000000 --- a/lib/Target/R600/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMR600Info - AMDGPUTargetInfo.cpp - ) diff --git a/lib/Target/R600/TargetInfo/LLVMBuild.txt b/lib/Target/R600/TargetInfo/LLVMBuild.txt deleted file mode 100644 index c3d3cf51cc8..00000000000 --- a/lib/Target/R600/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = R600Info -parent = R600 -required_libraries = Support -add_to_library_groups = R600 diff --git a/lib/Target/R600/TargetInfo/Makefile b/lib/Target/R600/TargetInfo/Makefile deleted file mode 100644 index b8ac4e78230..00000000000 --- a/lib/Target/R600/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMR600Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/VIInstrFormats.td b/lib/Target/R600/VIInstrFormats.td deleted file mode 100644 index d8738f99263..00000000000 --- a/lib/Target/R600/VIInstrFormats.td +++ /dev/null @@ -1,166 +0,0 @@ -//===-- VIInstrFormats.td - VI Instruction Encodings ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// VI Instruction format definitions. -// -//===----------------------------------------------------------------------===// - -class DSe_vi op> : Enc64 { - bits<8> vdst; - bits<1> gds; - bits<8> addr; - bits<8> data0; - bits<8> data1; - bits<8> offset0; - bits<8> offset1; - - let Inst{7-0} = offset0; - let Inst{15-8} = offset1; - let Inst{16} = gds; - let Inst{24-17} = op; - let Inst{31-26} = 0x36; //encoding - let Inst{39-32} = addr; - let Inst{47-40} = data0; - let Inst{55-48} = data1; - let Inst{63-56} = vdst; -} - -class MUBUFe_vi op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<1> lds; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{16} = lds; - let Inst{17} = slc; - let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class MTBUFe_vi op> : Enc64 { - bits<12> offset; - bits<1> offen; - bits<1> idxen; - bits<1> glc; - bits<4> dfmt; - bits<3> nfmt; - bits<8> vaddr; - bits<8> vdata; - bits<7> srsrc; - bits<1> slc; - bits<1> tfe; - bits<8> soffset; - - let Inst{11-0} = offset; - let Inst{12} = offen; - let Inst{13} = idxen; - let Inst{14} = glc; - let Inst{18-15} = op; - let Inst{22-19} = dfmt; - let Inst{25-23} = nfmt; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = vaddr; - let Inst{47-40} = vdata; - let Inst{52-48} = srsrc{6-2}; - let Inst{54} = slc; - let Inst{55} = tfe; - let Inst{63-56} = soffset; -} - -class SMEMe_vi op, bit imm> : Enc64 { - bits<7> sbase; - bits<7> sdata; - bits<1> glc; - bits<20> offset; - - let Inst{5-0} = sbase{6-1}; - let Inst{12-6} = sdata; - let Inst{16} = glc; - let Inst{17} = imm; - let Inst{25-18} = op; - let Inst{31-26} = 0x30; //encoding - let Inst{51-32} = offset; -} - -class VOP3e_vi op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<1> clamp; - bits<2> omod; - - let Inst{7-0} = vdst; - let Inst{8} = src0_modifiers{1}; - let Inst{9} = src1_modifiers{1}; - let Inst{10} = src2_modifiers{1}; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class VOP3be_vi op> : Enc64 { - bits<8> vdst; - bits<2> src0_modifiers; - bits<9> src0; - bits<2> src1_modifiers; - bits<9> src1; - bits<2> src2_modifiers; - bits<9> src2; - bits<7> sdst; - bits<2> omod; - bits<1> clamp; - - let Inst{7-0} = vdst; - let Inst{14-8} = sdst; - let Inst{15} = clamp; - let Inst{25-16} = op; - let Inst{31-26} = 
0x34; //encoding - let Inst{40-32} = src0; - let Inst{49-41} = src1; - let Inst{58-50} = src2; - let Inst{60-59} = omod; - let Inst{61} = src0_modifiers{0}; - let Inst{62} = src1_modifiers{0}; - let Inst{63} = src2_modifiers{0}; -} - -class EXPe_vi : EXPe { - let Inst{31-26} = 0x31; //encoding -} - -class VINTRPe_vi op> : VINTRPe { - let Inst{31-26} = 0x35; // encoding -} diff --git a/lib/Target/R600/VIInstructions.td b/lib/Target/R600/VIInstructions.td deleted file mode 100644 index 5bf86e649ce..00000000000 --- a/lib/Target/R600/VIInstructions.td +++ /dev/null @@ -1,106 +0,0 @@ -//===-- VIInstructions.td - VI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Instruction definitions for VI and newer. -//===----------------------------------------------------------------------===// - -let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { - -//===----------------------------------------------------------------------===// -// VOP1 Instructions -//===----------------------------------------------------------------------===// - -defm V_CVT_F16_U16 : VOP1Inst , "v_cvt_f16_u16", VOP_F16_I16>; -defm V_CVT_F16_I16 : VOP1Inst , "v_cvt_f16_i16", VOP_F16_I16>; -defm V_CVT_U16_F16 : VOP1Inst , "v_cvt_u16_f16", VOP_I16_F16>; -defm V_CVT_I16_F16 : VOP1Inst , "v_cvt_i16_f16", VOP_I16_F16>; -defm V_RCP_F16 : VOP1Inst , "v_rcp_f16", VOP_F16_F16>; -defm V_SQRT_F16 : VOP1Inst , "v_sqrt_f16", VOP_F16_F16>; -defm V_RSQ_F16 : VOP1Inst , "v_rsq_f16", VOP_F16_F16>; -defm V_LOG_F16 : VOP1Inst , "v_log_f16", VOP_F16_F16>; -defm V_EXP_F16 : VOP1Inst , "v_exp_f16", VOP_F16_F16>; -defm V_FREXP_MANT_F16 : VOP1Inst , "v_frexp_mant_f16", - VOP_F16_F16 ->; -defm V_FREXP_EXP_I16_F16 : VOP1Inst , "v_frexp_exp_i16_f16", - VOP_I16_F16 ->; -defm V_FLOOR_F16 : VOP1Inst , "v_floor_f16", VOP_F16_F16>; -defm V_CEIL_F16 : VOP1Inst , "v_ceil_f16", VOP_F16_F16>; -defm V_TRUNC_F16 : VOP1Inst , "v_trunc_f16", VOP_F16_F16>; -defm V_RNDNE_F16 : VOP1Inst , "v_rndne_f16", VOP_F16_F16>; -defm V_FRACT_F16 : VOP1Inst , "v_fract_f16", VOP_F16_F16>; -defm V_SIN_F16 : VOP1Inst , "v_sin_f16", VOP_F16_F16>; -defm V_COS_F16 : VOP1Inst , "v_cos_f16", VOP_F16_F16>; - -//===----------------------------------------------------------------------===// -// VOP2 Instructions -//===----------------------------------------------------------------------===// - -let isCommutable = 1 in { - -defm V_ADD_F16 : VOP2Inst , "v_add_f16", VOP_F16_F16_F16>; -defm V_SUB_F16 : VOP2Inst , "v_sub_f16", VOP_F16_F16_F16>; -defm V_SUBREV_F16 : VOP2Inst , "v_subrev_f16", VOP_F16_F16_F16, - null_frag, "v_sub_f16" ->; -defm V_MUL_F16 : VOP2Inst , "v_mul_f16", VOP_F16_F16_F16>; -defm V_MAC_F16 : VOP2Inst , "v_mac_f16", VOP_F16_F16_F16>; -} // End isCommutable = 1 -defm V_MADMK_F16 : VOP2MADK , "v_madmk_f16">; -let isCommutable = 1 in { -defm V_MADAK_F16 : VOP2MADK , "v_madak_f16">; -defm V_ADD_U16 : VOP2Inst , "v_add_u16", VOP_I16_I16_I16>; -defm V_SUB_U16 : VOP2Inst , "v_sub_u16" , VOP_I16_I16_I16>; -defm V_SUBREV_U16 : VOP2Inst , "v_subrev_u16", VOP_I16_I16_I16>; -defm V_MUL_LO_U16 : VOP2Inst , "v_mul_lo_u16", VOP_I16_I16_I16>; -} // End isCommutable = 1 -defm V_LSHLREV_B16 : VOP2Inst , "v_lshlrev_b16", VOP_I16_I16_I16>; -defm V_LSHRREV_B16 : VOP2Inst , "v_lshrrev_b16", VOP_I16_I16_I16>; -defm V_ASHRREV_B16 : 
VOP2Inst , "v_ashrrev_b16", VOP_I16_I16_I16>; -let isCommutable = 1 in { -defm V_MAX_F16 : VOP2Inst , "v_max_f16", VOP_F16_F16_F16>; -defm V_MIN_F16 : VOP2Inst , "v_min_f16", VOP_F16_F16_F16>; -defm V_MAX_U16 : VOP2Inst , "v_max_u16", VOP_I16_I16_I16>; -defm V_MAX_I16 : VOP2Inst , "v_max_i16", VOP_I16_I16_I16>; -defm V_MIN_U16 : VOP2Inst , "v_min_u16", VOP_I16_I16_I16>; -defm V_MIN_I16 : VOP2Inst , "v_min_i16", VOP_I16_I16_I16>; -} // End isCommutable = 1 -defm V_LDEXP_F16 : VOP2Inst , "v_ldexp_f16", VOP_F16_F16_I16>; - -// Aliases to simplify matching of floating-pint instructions that are VOP2 on -// SI and VOP3 on VI. - -class SI2_VI3Alias : InstAlias < - name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_32:$src0, 0, VCSrc_32:$src1, 0, 0) ->, PredicateControl { - let UseInstAsmMatchConverter = 0; -} - -def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; -def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; - -} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI - -//===----------------------------------------------------------------------===// -// SMEM Patterns -//===----------------------------------------------------------------------===// - -let Predicates = [isVI] in { - -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) ->; - -} // End Predicates = [isVI] diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll new file mode 100644 index 00000000000..c7bcfd2ddab --- /dev/null +++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -0,0 +1,139 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and +; the global address space(1) uses 64-bit pointers. These tests check to make sure +; the correct pointer size is used for the local address space. + +; The e{{32|64}} suffix on the instructions refers to the encoding size and not +; the size of the operands. The operand size is denoted in the instruction name. +; Instructions with B32, U32, and I32 in their name take 32-bit operands, while +; instructions with B64, U64, and I64 take 64-bit operands. 
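+; A rough illustration of the naming convention described above (these example +; lines are not part of the checked output, and the register numbers are made +; up): ds_read_b32 v0, v1 reads a 32-bit value while ds_read_b64 v[0:1], v2 +; reads a 64-bit value, and v_and_b32_e32 v0, v1, v2 versus v_and_b32_e64 +; v0, v1, v2 are the 32-bit and 64-bit encodings of the same 32-bit AND.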
+ +; FUNC-LABEL: {{^}}local_address_load: +; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] +; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] +define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = load i32, i32 addrspace(3)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep: +; SI: s_add_i32 [[SPTR:s[0-9]]] +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_read_b32 [[VPTR]] +define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_const_offset: +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 +define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Offset too large, can't fold into 16-bit immediate offset. +; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: +; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_read_b32 [[VPTR]] +define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 + %1 = load i32, i32 addrspace(3)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}null_32bit_lds_ptr: +; SI: v_cmp_ne_i32 +; SI-NOT: v_cmp_ne_i32 +; SI: v_cndmask_b32 +define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { + %cmp = icmp ne i32 addrspace(3)* %lds, null + %x = select i1 %cmp, i32 123, i32 456 + store i32 %x, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}mul_32bit_ptr: +; SI: s_mul_i32 +; SI-NEXT: s_add_i32 +; SI: ds_read_b32 +define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { + %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 + %val = load float, float addrspace(3)* %ptr + store float %val, float addrspace(1)* %out + ret void +} + +@g_lds = addrspace(3) global float undef, align 4 + +; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] +define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { + %val = load float, float addrspace(3)* @g_lds + store float %val, float addrspace(1)* %out + ret void +} + + +@ptr = addrspace(3) global i32 addrspace(3)* undef +@dst = addrspace(3) global [16384 x i32] undef + +; FUNC-LABEL: {{^}}global_ptr: +; SI: ds_write_b32 +define void @global_ptr() nounwind { + store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr + ret void +} + +; FUNC-LABEL: {{^}}local_address_store: +; SI: ds_write_b32 +define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { + store i32 %val, i32 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_store: +; SI: s_add_i32 [[SADDR:s[0-9]+]], +; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] +; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} +define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { + %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 %offset + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} +; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} +; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 +define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1 + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} + +; Offset too large, can't fold into 16-bit immediate offset. +; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: +; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} +define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 + store i32 %val, i32 addrspace(3)* %gep, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/README b/test/CodeGen/AMDGPU/README new file mode 100644 index 00000000000..96998bba28f --- /dev/null +++ b/test/CodeGen/AMDGPU/README @@ -0,0 +1,21 @@ ++==============================================================================+ +| How to organize the lit tests | ++==============================================================================+ + +- If you write a test for matching a single DAG opcode or intrinsic, it should + go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll) + +- If you write a test that matches several DAG opcodes and checks for a single + ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g. + bfi_int.ll + +- For all other tests, use your best judgement for organizing tests and naming + the files. + ++==============================================================================+ +| Naming conventions | ++==============================================================================+ + +- Use dash '-' and not underscore '_' to separate words in file names, unless + the file is named after a DAG opcode or ISA instruction that has an + underscore '_' in its name. diff --git a/test/CodeGen/AMDGPU/add-debug.ll b/test/CodeGen/AMDGPU/add-debug.ll new file mode 100644 index 00000000000..529905dd36a --- /dev/null +++ b/test/CodeGen/AMDGPU/add-debug.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug +; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug +; REQUIRES: asserts + +; Check that SelectionDAGDumper does not crash on int_SI_if. 
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll new file mode 100644 index 00000000000..655e75dbc1a --- /dev/null +++ b/test/CodeGen/AMDGPU/add.ll @@ -0,0 +1,192 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test1: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} +;SI-NOT: [[REG]] +;SI: buffer_store_dword [[REG]], +define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test2: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = add <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test4: +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = add <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test8: +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT + +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +entry: + %0 = add <8 x i32> %a, %b + store <8 x i32> %0, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test16: +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; 
EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT +; EG: ADD_INT + +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_add_i32 +define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +entry: + %0 = add <16 x i32> %a, %b + store <16 x i32> %0, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}add64: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = add i64 %a, %b + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they +; use VCC. The test is designed so that %a will be stored in an SGPR and +; %0 will be stored in a VGPR, so the comiler will be forced to copy %a +; to a VGPR before doing the add. + +; FUNC-LABEL: {{^}}add64_sgpr_vgpr: +; SI-NOT: v_addc_u32_e32 s + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { +entry: + %0 = load i64, i64 addrspace(1)* %in + %1 = add i64 %a, %0 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; Test i64 add inside a branch. +; FUNC-LABEL: {{^}}add64_in_branch: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: ADD_INT {{[* ]*}}[[LO]] +; EG-DAG: ADDC_UINT +; EG-DAG: ADD_INT +; EG-DAG: ADD_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = add i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll new file mode 100644 index 00000000000..8346add7df9 --- /dev/null +++ b/test/CodeGen/AMDGPU/add_i64.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + + +declare i32 @llvm.r600.read.tidig.x() readnone + +; SI-LABEL: {{^}}test_i64_vreg: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid + %a = load i64, i64 addrspace(1)* %a_ptr + %b = load i64, i64 addrspace(1)* %b_ptr + %result = add i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Check that the SGPR add operand is correctly moved to a VGPR. 
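+; (Illustrative sketch only, not a checked pattern: the expected lowering looks +; roughly like v_mov_b32_e32 v2, s0 to copy the SGPR operand into a VGPR, +; followed by the v_add_i32 / v_addc_u32 pair; the actual register numbers are +; chosen by the register allocator.)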
+; SI-LABEL: {{^}}sgpr_operand: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { + %foo = load i64, i64 addrspace(1)* %in, align 8 + %result = add i64 %foo, %a + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Swap the arguments. Check that the SGPR -> VGPR copy works with the +; SGPR as other operand. +; +; SI-LABEL: {{^}}sgpr_operand_reversed: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { + %foo = load i64, i64 addrspace(1)* %in, align 8 + %result = add i64 %a, %foo + store i64 %result, i64 addrspace(1)* %out + ret void +} + + +; SI-LABEL: {{^}}test_v2i64_sreg: +; SI: s_add_u32 +; SI: s_addc_u32 +; SI: s_add_u32 +; SI: s_addc_u32 +define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { + %result = add <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}test_v2i64_vreg: +; SI: v_add_i32 +; SI: v_addc_u32 +; SI: v_add_i32 +; SI: v_addc_u32 +define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid + %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = add <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}trunc_i64_add_to_i32: +; SI: s_load_dword s[[SREG0:[0-9]+]] +; SI: s_load_dword s[[SREG1:[0-9]+]] +; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI-NOT: addc +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %add = add i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll new file mode 100644 index 00000000000..4be8c584752 --- /dev/null +++ b/test/CodeGen/AMDGPU/address-space.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; Test that codegenprepare understands address space sizes + +%struct.foo = type { [3 x float], [3 x float] } + +; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is +; already in a VGPR after the first read. 
+ +; CHECK-LABEL: {{^}}do_as_ptr_calcs: +; CHECK: s_load_dword [[SREG1:s[0-9]+]], +; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] +; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 +; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 +define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { +entry: + %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 + %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 + br label %bb32 + +bb32: + %a = load float, float addrspace(3)* %x, align 4 + %b = load float, float addrspace(3)* %y, align 4 + %cmp = fcmp one float %a, %b + br i1 %cmp, label %bb34, label %bb33 + +bb33: + unreachable + +bb34: + unreachable +} + + diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll new file mode 100644 index 00000000000..5672d470bd7 --- /dev/null +++ b/test/CodeGen/AMDGPU/and.ll @@ -0,0 +1,296 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test2: +; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = and <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test4: +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = and <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_and_i32: +; SI: s_and_b32 +define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_and_constant_i32: +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 +define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i32: +; SI: v_and_b32 +define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { + %a = load i32, i32 
addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %and = and i32 %a, %b + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_constant_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} +define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, 1234567 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} +define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, 64 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 +; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} +define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %and = and i32 %a, -16 + store i32 %and, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_and_i64 +; SI: s_and_b64 +define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %and = and i64 %a, %b + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Should use SGPRs +; FUNC-LABEL: {{^}}s_and_i1: +; SI: v_and_b32 +define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { + %and = and i1 %a, %b + store i1 %and, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_and_constant_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { + %and = and i64 %a, 281474976710655 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i64: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_i64_br: +; SI: v_and_b32 +; SI: v_and_b32 +define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %endif + +if: + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %and = and i64 %a, %b + br label %endif + +endif: + %tmp1 = phi i64 [%and, %if], [0, %entry] + store i64 %tmp1, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_and_constant_i64: +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 1234567 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Replace and 0 with mov 0 +; FUNC-LABEL: {{^}}v_and_inline_imm_i64: +; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} +; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} +define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %and = and i64 %a, 64 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: 
{{^}}s_and_inline_imm_64_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64 +define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 64 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1 +define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 1 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 +define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4607182418800017408 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 +define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13830554455654793216 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 +define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4602678819172646912 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 +define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13826050856027422720 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0 +define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4611686018427387904 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0 +define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13835058055282163712 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 +define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4616189618054758400 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64 +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 +define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13839561654909534208 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + + +; Test with the 64-bit integer bitpattern for a 32-bit float in the +; low 32-bits, which is not a valid 64-bit inline immmediate. 
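+; For reference, the single-precision bit pattern of 4.0 is 0x40800000,
+; i.e. 1082130432; zero-extended to 64 bits this is not the double
+; pattern of 4.0 (4616189618054758400, used above), so it should not be
+; encodable as a floating-point inline constant and is instead expected
+; to be materialized with a pair of s_mov_b32 before the s_and_b64.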
+ +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 1082130432 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FIXME: Copy of -1 register +; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} +; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}} +define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, -1065353216 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; Shift into upper 32-bits +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 4647714815446351872 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64 +; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 +; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} +; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { + %and = and i64 %a, 13871086852301127680 + store i64 %and, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll new file mode 100644 index 00000000000..48d8f312249 --- /dev/null +++ b/test/CodeGen/AMDGPU/anyext.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}anyext_i1_i32: +; CHECK: v_cndmask_b32_e64 +define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + %1 = zext i1 %0 to i8 + %2 = xor i8 %1, -1 + %3 = and i8 %2, 1 + %4 = zext i8 %3 to i32 + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll new file mode 100644 index 00000000000..8c2a0795860 --- /dev/null +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -0,0 +1,44 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s + +declare i32 @llvm.SI.tid() nounwind readnone +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate + +; The required pointer calculations for the alloca'd actually requires +; an add and won't be folded into the addressing, which fails with a +; 64-bit pointer add. This should work since private pointers should +; be 32-bits. 
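+; As a rough sketch of the math involved: the alloca access below is at
+; %alloca + 1 * 16 + 4 * %b (16 bytes for the [4 x i32] index plus a
+; variable 4 * %b term), so an explicit add is needed, and with 32-bit
+; private pointers it should be a single v_add_i32.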
+ +; SI-LABEL: {{^}}test_private_array_ptr_calc: + +; FIXME: We end up with zero argument for ADD, because +; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index +; with the appropriate offset. We should fold this into the store. +; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} +; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] +; +; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this +; alloca to a vector. It currently fails because it does not know how +; to interpret: +; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b + +; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 +; SI-PROMOTE: ds_write_b32 [[PTRREG]] +define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { + %alloca = alloca [4 x i32], i32 4, align 16 + %tid = call i32 @llvm.SI.tid() readnone + %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid + %a = load i32, i32 addrspace(1)* %a_ptr + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b + store i32 %result, i32* %alloca_ptr, align 4 + ; Dummy call + call void @llvm.AMDGPU.barrier.local() nounwind noduplicate + %reload = load i32, i32* %alloca_ptr, align 4 + %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll new file mode 100644 index 00000000000..eae095eb844 --- /dev/null +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.SI.tid() readnone + +; SI-LABEL: {{^}}test_array_ptr_calc: +; SI: v_mul_lo_i32 +; SI: v_mul_hi_i32 +define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.SI.tid() readnone + %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 + %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid + %a = load i32, i32 addrspace(1)* %a_ptr + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll new file mode 100644 index 00000000000..ef2560ef184 --- /dev/null +++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: +; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SWAP:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] +; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: +; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] +; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] +; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic + %result = extractvalue { i64, i1 } %pair, 0 + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset +; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 +; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] +; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: +; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SICI: s_load_dwordx2 
s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 +; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] +; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] +; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic + %result = extractvalue { i64, i1 } %pair, 0 + ret void +} diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll new file mode 100644 index 00000000000..20c685447ee --- /dev/null +++ b/test/CodeGen/AMDGPU/atomic_load_add.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_local: +; R600: LDS_ADD * +; SI: ds_add_u32 +define void @atomic_add_local(i32 addrspace(3)* %local) { + %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_local_const_offset: +; R600: LDS_ADD * +; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 + %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_local: +; R600: LDS_ADD_RET * +; SI: ds_add_rtn_u32 +define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: +; R600: LDS_ADD_RET * +; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 + %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll new file mode 100644 index 00000000000..4c6f45525b9 --- /dev/null +++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_sub_local: +; R600: LDS_SUB * +; SI: ds_sub_u32 +define void @atomic_sub_local(i32 addrspace(3)* %local) { + %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst + ret void +} 
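+
+; For the constant-offset variants below, the byte offset is simply the
+; i32 GEP index times 4 (index 4 -> offset:16, index 5 -> offset:20),
+; which the ds instructions should be able to fold into their offset
+; field instead of emitting a separate address add.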
+ +; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: +; R600: LDS_SUB * +; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 + %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_local: +; R600: LDS_SUB_RET * +; SI: ds_sub_rtn_u32 +define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: +; R600: LDS_SUB_RET * +; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 +define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { + %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 + %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll new file mode 100644 index 00000000000..abdc4afef47 --- /dev/null +++ b/test/CodeGen/AMDGPU/basic-branch.ll @@ -0,0 +1,16 @@ +; XFAIL: * +; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_branch( +define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { + %cmp = icmp ne i32 %val, 0 + br i1 %cmp, label %store, label %end + +store: + store i32 222, i32 addrspace(1)* %out + ret void + +end: + ret void +} diff --git a/test/CodeGen/AMDGPU/basic-loop.ll b/test/CodeGen/AMDGPU/basic-loop.ll new file mode 100644 index 00000000000..f0263caf5d6 --- /dev/null +++ b/test/CodeGen/AMDGPU/basic-loop.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_loop: +define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { +entry: + br label %loop.body + +loop.body: + %i = phi i32 [0, %entry], [%i.inc, %loop.body] + store i32 222, i32 addrspace(1)* %out + %cmp = icmp ne i32 %i, %val + %i.inc = add i32 %i, 1 + br i1 %cmp, label %loop.body, label %end + +end: + ret void +} diff --git a/test/CodeGen/AMDGPU/bfe_uint.ll b/test/CodeGen/AMDGPU/bfe_uint.ll new file mode 100644 index 00000000000..32e3fc26106 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfe_uint.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}bfe_def: +; CHECK: BFE_UINT +define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 5 + %1 = and i32 %0, 15 ; 0xf + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; This program could be implemented using a BFE_UINT instruction, however +; since the lshr constant + number of bits in the mask is >= 32, it can also be +; implmented with a LSHR instruction, which is better, because LSHR has less +; operands and requires less constants. 
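+; Concretely, the shift below is by 16 and the mask keeps 16 bits, and
+; 16 + 16 >= 32, so for a 32-bit value (x >> 16) & 0xffff == x >> 16 and
+; the mask is redundant; bfe_def above, with 5 + 4 = 9 < 32 bits, still
+; needs an actual bit-field extract.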
+ +; CHECK: {{^}}bfe_shift: +; CHECK-NOT: BFE_UINT +define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 16 + %1 = and i32 %0, 65535 ; 0xffff + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll new file mode 100644 index 00000000000..03349349735 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfi_int.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; BFI_INT Definition pattern from ISA docs +; (y & x) | (z & ~x) +; +; R600: {{^}}bfi_def: +; R600: BFI_INT +; SI: @bfi_def +; SI: v_bfi_b32 +define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = xor i32 %x, -1 + %1 = and i32 %z, %0 + %2 = and i32 %y, %x + %3 = or i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; SHA-256 Ch function +; z ^ (x & (y ^ z)) +; R600: {{^}}bfi_sha256_ch: +; R600: BFI_INT +; SI: @bfi_sha256_ch +; SI: v_bfi_b32 +define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = xor i32 %y, %z + %1 = and i32 %x, %0 + %2 = xor i32 %z, %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; SHA-256 Ma function +; ((x & z) | (y & (x | z))) +; R600: {{^}}bfi_sha256_ma: +; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W +; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W +; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} +; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} + +define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +entry: + %0 = and i32 %x, %z + %1 = or i32 %x, %z + %2 = and i32 %y, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll new file mode 100644 index 00000000000..2671c5d102b --- /dev/null +++ b/test/CodeGen/AMDGPU/big_alu.ll @@ -0,0 +1,1173 @@ +;RUN: llc < %s -march=r600 -mcpu=cedar + +;This test ensures that R600 backend can handle ifcvt properly +;and do not generate ALU clauses with more than 128 instructions. 
+ +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { +main_body: + %0 = extractelement <4 x float> %reg0, i32 0 + %1 = extractelement <4 x float> %reg0, i32 1 + %2 = extractelement <4 x float> %reg0, i32 2 + %3 = extractelement <4 x float> %reg0, i32 3 + %4 = extractelement <4 x float> %reg1, i32 0 + %5 = extractelement <4 x float> %reg9, i32 0 + %6 = extractelement <4 x float> %reg8, i32 0 + %7 = fcmp ugt float %6, 0.000000e+00 + %8 = select i1 %7, float %4, float %5 + %9 = extractelement <4 x float> %reg1, i32 1 + %10 = extractelement <4 x float> %reg9, i32 1 + %11 = extractelement <4 x float> %reg8, i32 0 + %12 = fcmp ugt float %11, 0.000000e+00 + %13 = select i1 %12, float %9, float %10 + %14 = extractelement <4 x float> %reg1, i32 2 + %15 = extractelement <4 x float> %reg9, i32 2 + %16 = extractelement <4 x float> %reg8, i32 0 + %17 = fcmp ugt float %16, 0.000000e+00 + %18 = select i1 %17, float %14, float %15 + %19 = extractelement <4 x float> %reg1, i32 3 + %20 = extractelement <4 x float> %reg9, i32 3 + %21 = extractelement <4 x float> %reg8, i32 0 + %22 = extractelement <4 x float> %reg2, i32 0 + %23 = extractelement <4 x float> %reg2, i32 1 + %24 = extractelement <4 x float> %reg2, i32 2 + %25 = extractelement <4 x float> %reg2, i32 3 + %26 = extractelement <4 x float> %reg3, i32 0 + %27 = extractelement <4 x float> %reg3, i32 1 + %28 = extractelement <4 x float> %reg3, i32 2 + %29 = extractelement <4 x float> %reg3, i32 3 + %30 = extractelement <4 x float> %reg4, i32 0 + %31 = extractelement <4 x float> %reg4, i32 1 + %32 = extractelement <4 x float> %reg4, i32 2 + %33 = extractelement <4 x float> %reg4, i32 3 + %34 = extractelement <4 x float> %reg5, i32 0 + %35 = extractelement <4 x float> %reg5, i32 1 + %36 = extractelement <4 x float> %reg5, i32 2 + %37 = extractelement <4 x float> %reg5, i32 3 + %38 = extractelement <4 x float> %reg6, i32 0 + %39 = extractelement <4 x float> %reg6, i32 1 + %40 = extractelement <4 x float> %reg6, i32 2 + %41 = extractelement <4 x float> %reg6, i32 3 + %42 = extractelement <4 x float> %reg7, i32 0 + %43 = extractelement <4 x float> %reg7, i32 1 + %44 = extractelement <4 x float> %reg7, i32 2 + %45 = extractelement <4 x float> %reg7, i32 3 + %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %47 = extractelement <4 x float> %46, i32 0 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %49 = extractelement <4 x float> %48, i32 1 + %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %51 = extractelement <4 x float> %50, i32 2 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %53 = extractelement <4 x float> %52, i32 0 + %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %55 = extractelement <4 x float> %54, i32 0 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %57 
= extractelement <4 x float> %56, i32 1 + %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %59 = extractelement <4 x float> %58, i32 2 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %61 = extractelement <4 x float> %60, i32 3 + %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %63 = extractelement <4 x float> %62, i32 0 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %65 = extractelement <4 x float> %64, i32 1 + %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %67 = extractelement <4 x float> %66, i32 2 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %69 = extractelement <4 x float> %68, i32 0 + %70 = fcmp oge float %69, 3.500000e+00 + %71 = sext i1 %70 to i32 + %72 = bitcast i32 %71 to float + %73 = bitcast float %72 to i32 + %74 = icmp ne i32 %73, 0 + %. = select i1 %74, float 0.000000e+00, float 0.000000e+00 + %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %76 = extractelement <4 x float> %75, i32 0 + %77 = fcmp oge float %76, 2.000000e+00 + %78 = sext i1 %77 to i32 + %79 = bitcast i32 %78 to float + %80 = bitcast float %79 to i32 + %81 = icmp ne i32 %80, 0 + br i1 %81, label %IF137, label %ENDIF136 + +IF137: ; preds = %main_body + %82 = insertelement <4 x float> undef, float %30, i32 0 + %83 = insertelement <4 x float> %82, float %31, i32 1 + %84 = insertelement <4 x float> %83, float %32, i32 2 + %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 + %86 = insertelement <4 x float> undef, float %30, i32 0 + %87 = insertelement <4 x float> %86, float %31, i32 1 + %88 = insertelement <4 x float> %87, float %32, i32 2 + %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 + %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) + %91 = call float @llvm.AMDGPU.rsq.f32(float %90) + %92 = fmul float %30, %91 + %93 = fmul float %31, %91 + %94 = fmul float %32, %91 + %95 = insertelement <4 x float> undef, float %92, i32 0 + %96 = insertelement <4 x float> %95, float %93, i32 1 + %97 = insertelement <4 x float> %96, float %94, i32 2 + %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3 + %99 = insertelement <4 x float> undef, float %37, i32 0 + %100 = insertelement <4 x float> %99, float %38, i32 1 + %101 = insertelement <4 x float> %100, float %39, i32 2 + %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3 + %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102) + %104 = insertelement <4 x float> undef, float %92, i32 0 + %105 = insertelement <4 x float> %104, float %93, i32 1 + %106 = insertelement <4 x float> %105, float %94, i32 2 + %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3 + %108 = insertelement <4 x float> undef, float %40, i32 0 + %109 = insertelement <4 x float> %108, float %41, i32 1 + %110 = insertelement <4 x float> %109, float %42, i32 2 + %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3 + %112 = call 
float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111) + %113 = fsub float -0.000000e+00, %92 + %114 = fsub float -0.000000e+00, %93 + %115 = fsub float -0.000000e+00, %94 + %116 = insertelement <4 x float> undef, float %34, i32 0 + %117 = insertelement <4 x float> %116, float %35, i32 1 + %118 = insertelement <4 x float> %117, float %36, i32 2 + %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3 + %120 = insertelement <4 x float> undef, float %113, i32 0 + %121 = insertelement <4 x float> %120, float %114, i32 1 + %122 = insertelement <4 x float> %121, float %115, i32 2 + %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3 + %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123) + %125 = fdiv float 1.000000e+00, %124 + %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %127 = extractelement <4 x float> %126, i32 0 + %128 = fmul float %127, %125 + %129 = fmul float %103, %128 + %130 = fmul float %112, %128 + %131 = bitcast float %. to i32 + %132 = sitofp i32 %131 to float + %133 = fdiv float 1.000000e+00, %132 + %134 = bitcast float %. to i32 + %135 = add i32 %134, -1 + %136 = bitcast i32 %135 to float + %137 = bitcast float %136 to i32 + br label %LOOP + +ENDIF136: ; preds = %main_body, %ENDIF154 + %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ] + %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ] + %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ] + %138 = fmul float %26, 0x3F847AE140000000 + %139 = fmul float %27, 0x3F847AE140000000 + %140 = fmul float %28, 0x3F847AE140000000 + %141 = insertelement <4 x float> undef, float %138, i32 0 + %142 = insertelement <4 x float> %141, float %139, i32 1 + %143 = insertelement <4 x float> %142, float %140, i32 2 + %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3 + %145 = extractelement <4 x float> %144, i32 0 + %146 = extractelement <4 x float> %144, i32 1 + %147 = extractelement <4 x float> %144, i32 2 + %148 = extractelement <4 x float> %144, i32 3 + %149 = insertelement <4 x float> undef, float %145, i32 0 + %150 = insertelement <4 x float> %149, float %146, i32 1 + %151 = insertelement <4 x float> %150, float %147, i32 2 + %152 = insertelement <4 x float> %151, float %148, i32 3 + %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3) + %154 = extractelement <4 x float> %153, i32 0 + %155 = extractelement <4 x float> %153, i32 1 + %156 = extractelement <4 x float> %153, i32 2 + %157 = extractelement <4 x float> %153, i32 3 + %158 = fmul float %26, 0x3F45A07B40000000 + %159 = fmul float %27, 0x3F45A07B40000000 + %160 = fmul float %28, 0x3F45A07B40000000 + %161 = insertelement <4 x float> undef, float %158, i32 0 + %162 = insertelement <4 x float> %161, float %159, i32 1 + %163 = insertelement <4 x float> %162, float %160, i32 2 + %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3 + %165 = extractelement <4 x float> %164, i32 0 + %166 = extractelement <4 x float> %164, i32 1 + %167 = extractelement <4 x float> %164, i32 2 + %168 = extractelement <4 x float> %164, i32 3 + %169 = insertelement <4 x float> undef, float %165, i32 0 + %170 = insertelement <4 x float> %169, float %166, i32 1 + %171 = insertelement <4 x float> %170, float %167, i32 2 + %172 = insertelement <4 x float> %171, float %168, i32 3 + %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3) + 
%174 = extractelement <4 x float> %173, i32 0 + %175 = extractelement <4 x float> %173, i32 1 + %176 = extractelement <4 x float> %173, i32 2 + %177 = extractelement <4 x float> %173, i32 3 + %178 = fmul float %176, 3.000000e+03 + %179 = fadd float %178, %28 + %180 = fdiv float 1.000000e+00, %33 + %181 = fmul float %32, %180 + %182 = call float @fabs(float %181) + %183 = fmul float %174, 0x3FD99999A0000000 + %184 = fadd float %183, 0x3FAEB851E0000000 + %185 = fmul float %175, 0x3FE3333340000000 + %186 = fadd float %185, %184 + %187 = fmul float %176, 2.000000e+00 + %188 = fadd float %187, %186 + %189 = fmul float %177, 4.000000e+00 + %190 = fadd float %189, %188 + %191 = fmul float %154, 0x3FB99999A0000000 + %192 = fadd float %191, %190 + %193 = fmul float %155, 0x3FD99999A0000000 + %194 = fadd float %193, %192 + %195 = fmul float %156, 0x3FE99999A0000000 + %196 = fadd float %195, %194 + %197 = fmul float %157, 0x4000CCCCC0000000 + %198 = fadd float %197, %196 + %199 = fmul float 0xBE5EFB4CC0000000, %182 + %200 = fmul float %199, %182 + %201 = call float @llvm.AMDIL.exp.(float %200) + %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000) + %203 = fadd float %202, 0x3FF4CCCCC0000000 + %204 = fmul float %203, 0x3FE1C71C80000000 + %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00) + %206 = fadd float %202, 0x3FF4CCCCC0000000 + %207 = fmul float %206, 0x3FE1C71C80000000 + %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00) + %209 = fadd float %202, 2.000000e+00 + %210 = fmul float %209, 0x3FD611A7A0000000 + %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00) + %212 = fmul float 2.000000e+00, %205 + %213 = fsub float -0.000000e+00, %212 + %214 = fadd float 3.000000e+00, %213 + %215 = fmul float %205, %214 + %216 = fmul float %205, %215 + %217 = fmul float 2.000000e+00, %208 + %218 = fsub float -0.000000e+00, %217 + %219 = fadd float 3.000000e+00, %218 + %220 = fmul float %208, %219 + %221 = fmul float %208, %220 + %222 = fmul float 2.000000e+00, %211 + %223 = fsub float -0.000000e+00, %222 + %224 = fadd float 3.000000e+00, %223 + %225 = fmul float %211, %224 + %226 = fmul float %211, %225 + %227 = fmul float %26, 0x3F368B5CC0000000 + %228 = fmul float %27, 0x3F368B5CC0000000 + %229 = insertelement <4 x float> undef, float %227, i32 0 + %230 = insertelement <4 x float> %229, float %228, i32 1 + %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2 + %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3 + %233 = extractelement <4 x float> %232, i32 0 + %234 = extractelement <4 x float> %232, i32 1 + %235 = insertelement <4 x float> undef, float %233, i32 0 + %236 = insertelement <4 x float> %235, float %234, i32 1 + %237 = insertelement <4 x float> %236, float undef, i32 2 + %238 = insertelement <4 x float> %237, float undef, i32 3 + %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2) + %240 = extractelement <4 x float> %239, i32 0 + %241 = insertelement <4 x float> undef, float %240, i32 0 + %242 = insertelement <4 x float> %241, float %228, i32 1 + %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2 + %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3 + %245 = extractelement <4 x float> %244, i32 0 + %246 = insertelement <4 x float> undef, float %245, i32 0 + %247 = insertelement <4 x float> %246, float undef, i32 1 + %248 = insertelement <4 x float> %247, float undef, i32 2 + 
%249 = insertelement <4 x float> %248, float undef, i32 3 + %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1) + %251 = extractelement <4 x float> %250, i32 0 + %252 = extractelement <4 x float> %250, i32 1 + %253 = extractelement <4 x float> %250, i32 2 + %254 = extractelement <4 x float> %250, i32 3 + %255 = fmul float %251, %216 + %256 = fmul float %252, %221 + %257 = fmul float %253, %226 + %258 = fmul float %254, 0.000000e+00 + %259 = fadd float %202, 0x3FF4CCCCC0000000 + %260 = fmul float %259, 0x3FE1C71C80000000 + %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00) + %262 = fadd float %202, 0x3FF4CCCCC0000000 + %263 = fmul float %262, 0x3FE1C71C80000000 + %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00) + %265 = fadd float %202, 2.000000e+00 + %266 = fmul float %265, 0x3FD611A7A0000000 + %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00) + %268 = fmul float 2.000000e+00, %261 + %269 = fsub float -0.000000e+00, %268 + %270 = fadd float 3.000000e+00, %269 + %271 = fmul float %261, %270 + %272 = fmul float %261, %271 + %273 = fmul float 2.000000e+00, %264 + %274 = fsub float -0.000000e+00, %273 + %275 = fadd float 3.000000e+00, %274 + %276 = fmul float %264, %275 + %277 = fmul float %264, %276 + %278 = fmul float 2.000000e+00, %267 + %279 = fsub float -0.000000e+00, %278 + %280 = fadd float 3.000000e+00, %279 + %281 = fmul float %267, %280 + %282 = fmul float %267, %281 + %283 = fmul float %26, 0x3F22DFD6A0000000 + %284 = fmul float %27, 0x3F22DFD6A0000000 + %285 = insertelement <4 x float> undef, float %283, i32 0 + %286 = insertelement <4 x float> %285, float %284, i32 1 + %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2 + %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3 + %289 = extractelement <4 x float> %288, i32 0 + %290 = extractelement <4 x float> %288, i32 1 + %291 = insertelement <4 x float> undef, float %289, i32 0 + %292 = insertelement <4 x float> %291, float %290, i32 1 + %293 = insertelement <4 x float> %292, float undef, i32 2 + %294 = insertelement <4 x float> %293, float undef, i32 3 + %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2) + %296 = extractelement <4 x float> %295, i32 0 + %297 = extractelement <4 x float> %295, i32 1 + %298 = extractelement <4 x float> %295, i32 2 + %299 = extractelement <4 x float> %295, i32 3 + %300 = fmul float %296, %272 + %301 = fmul float %297, %277 + %302 = fmul float %298, %282 + %303 = fmul float %299, 0.000000e+00 + %304 = fmul float %temp68.1, %37 + %305 = fmul float %temp68.1, %38 + %306 = fmul float %temp68.1, %39 + %307 = fmul float %temp69.0, %40 + %308 = fadd float %307, %304 + %309 = fmul float %temp69.0, %41 + %310 = fadd float %309, %305 + %311 = fmul float %temp69.0, %42 + %312 = fadd float %311, %306 + %313 = fmul float %temp70.0, %34 + %314 = fadd float %313, %308 + %315 = fmul float %temp70.0, %35 + %316 = fadd float %315, %310 + %317 = fmul float %temp70.0, %36 + %318 = fadd float %317, %312 + %319 = insertelement <4 x float> undef, float %314, i32 0 + %320 = insertelement <4 x float> %319, float %316, i32 1 + %321 = insertelement <4 x float> %320, float %318, i32 2 + %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3 + %323 = insertelement <4 x float> undef, float %314, i32 0 + %324 = insertelement <4 x float> %323, float %316, i32 1 + %325 = insertelement <4 x float> %324, float %318, i32 2 + 
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 + %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) + %328 = call float @llvm.AMDGPU.rsq.f32(float %327) + %329 = fmul float %314, %328 + %330 = fmul float %316, %328 + %331 = fmul float %318, %328 + %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %333 = extractelement <4 x float> %332, i32 0 + %334 = fsub float -0.000000e+00, %333 + %335 = fadd float 1.000000e+00, %334 + %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %337 = extractelement <4 x float> %336, i32 0 + %338 = fsub float -0.000000e+00, %337 + %339 = fadd float 1.000000e+00, %338 + %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %341 = extractelement <4 x float> %340, i32 0 + %342 = fsub float -0.000000e+00, %341 + %343 = fadd float 1.000000e+00, %342 + %344 = fsub float -0.000000e+00, %335 + %345 = fadd float %202, %344 + %346 = fsub float -0.000000e+00, %339 + %347 = fadd float %202, %346 + %348 = fadd float %347, 0xBFE3333340000000 + %349 = fsub float -0.000000e+00, %202 + %350 = fsub float -0.000000e+00, %343 + %351 = fadd float %349, %350 + %352 = insertelement <4 x float> undef, float %43, i32 0 + %353 = insertelement <4 x float> %352, float %44, i32 1 + %354 = insertelement <4 x float> %353, float %45, i32 2 + %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3 + %356 = insertelement <4 x float> undef, float %43, i32 0 + %357 = insertelement <4 x float> %356, float %44, i32 1 + %358 = insertelement <4 x float> %357, float %45, i32 2 + %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 + %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) + %361 = call float @llvm.AMDGPU.rsq.f32(float %360) + %362 = fmul float %45, %361 + %363 = call float @fabs(float %362) + %364 = fmul float %176, 0x3FECCCCCC0000000 + %365 = fadd float %364, %363 + %366 = fadd float %365, 0xBFEFAE1480000000 + %367 = fmul float %366, 0xC023FFFFC0000000 + %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00) + %369 = fsub float -0.000000e+00, %335 + %370 = fadd float %202, %369 + %371 = fadd float %370, 0x3FBEB851E0000000 + %372 = fsub float -0.000000e+00, %339 + %373 = fadd float %202, %372 + %374 = fadd float %373, 0xBFE0A3D700000000 + %375 = fsub float -0.000000e+00, %202 + %376 = fsub float -0.000000e+00, %343 + %377 = fadd float %375, %376 + %378 = insertelement <4 x float> undef, float %43, i32 0 + %379 = insertelement <4 x float> %378, float %44, i32 1 + %380 = insertelement <4 x float> %379, float %45, i32 2 + %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3 + %382 = insertelement <4 x float> undef, float %43, i32 0 + %383 = insertelement <4 x float> %382, float %44, i32 1 + %384 = insertelement <4 x float> %383, float %45, i32 2 + %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 + %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) + %387 = call float @llvm.AMDGPU.rsq.f32(float %386) + %388 = fmul float %45, %387 + %389 = call float @fabs(float %388) + %390 = fmul float %176, 0x3FF51EB860000000 + %391 = fadd float %390, %389 + %392 = fadd float %391, 0xBFEFAE1480000000 + %393 = fmul float %392, 0xC0490001A0000000 + %394 = call float 
@llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00) + %395 = fmul float 2.000000e+00, %368 + %396 = fsub float -0.000000e+00, %395 + %397 = fadd float 3.000000e+00, %396 + %398 = fmul float %368, %397 + %399 = fmul float %368, %398 + %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345) + %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348) + %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351) + %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00) + %404 = fmul float 2.000000e+00, %394 + %405 = fsub float -0.000000e+00, %404 + %406 = fadd float 3.000000e+00, %405 + %407 = fmul float %394, %406 + %408 = fmul float %394, %407 + %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371) + %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374) + %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377) + %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000) + %413 = fcmp oge float 2.200000e+03, %179 + %414 = sext i1 %413 to i32 + %415 = bitcast i32 %414 to float + %416 = bitcast float %415 to i32 + %417 = icmp ne i32 %416, 0 + br i1 %417, label %IF161, label %ENDIF160 + +LOOP: ; preds = %ENDIF139, %IF137 + %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ] + %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ] + %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ] + %418 = bitcast float %temp96.0 to i32 + %419 = icmp sge i32 %418, %137 + %420 = sext i1 %419 to i32 + %421 = bitcast i32 %420 to float + %422 = bitcast float %421 to i32 + %423 = icmp ne i32 %422, 0 + br i1 %423, label %IF140, label %ENDIF139 + +IF140: ; preds = %LOOP + %424 = fmul float %133, 5.000000e-01 + %425 = fmul float %129, %temp92.0 + %426 = fadd float %425, %22 + %427 = fmul float %130, %temp92.0 + %428 = fadd float %427, %23 + %429 = insertelement <4 x float> undef, float %426, i32 0 + %430 = insertelement <4 x float> %429, float %428, i32 1 + %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2 + %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3 + %433 = extractelement <4 x float> %432, i32 0 + %434 = extractelement <4 x float> %432, i32 1 + %435 = insertelement <4 x float> undef, float %433, i32 0 + %436 = insertelement <4 x float> %435, float %434, i32 1 + %437 = insertelement <4 x float> %436, float undef, i32 2 + %438 = insertelement <4 x float> %437, float undef, i32 3 + %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2) + %440 = extractelement <4 x float> %439, i32 3 + %441 = fcmp oge float %temp92.0, %440 + %442 = sext i1 %441 to i32 + %443 = bitcast i32 %442 to float + %444 = bitcast float %443 to i32 + %445 = icmp ne i32 %444, 0 + br i1 %445, label %IF146, label %ENDIF145 + +ENDIF139: ; preds = %LOOP + %446 = fadd float %temp88.0, %133 + %447 = fmul float %129, %446 + %448 = fadd float %447, %22 + %449 = fmul float %130, %446 + %450 = fadd float %449, %23 + %451 = insertelement <4 x float> undef, float %448, i32 0 + %452 = insertelement <4 x float> %451, float %450, i32 1 + %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2 + %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3 + %455 = extractelement <4 x float> %454, i32 0 + %456 = extractelement <4 x float> %454, i32 1 + %457 = insertelement <4 x float> undef, float %455, i32 0 + %458 = insertelement <4 x float> %457, float %456, i32 1 + %459 = 
insertelement <4 x float> %458, float undef, i32 2 + %460 = insertelement <4 x float> %459, float undef, i32 3 + %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2) + %462 = extractelement <4 x float> %461, i32 3 + %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0 + %464 = sext i1 %463 to i32 + %465 = bitcast i32 %464 to float + %466 = fcmp oge float %446, %462 + %467 = sext i1 %466 to i32 + %468 = bitcast i32 %467 to float + %469 = bitcast float %465 to i32 + %470 = bitcast float %468 to i32 + %471 = and i32 %469, %470 + %472 = bitcast i32 %471 to float + %473 = bitcast float %472 to i32 + %474 = icmp ne i32 %473, 0 + %.temp92.0 = select i1 %474, float %446, float %temp92.0 + %475 = bitcast float %temp96.0 to i32 + %476 = add i32 %475, 1 + %477 = bitcast i32 %476 to float + br label %LOOP + +IF146: ; preds = %IF140 + %478 = fmul float 2.000000e+00, %424 + %479 = fsub float -0.000000e+00, %478 + %480 = fadd float %temp92.0, %479 + br label %ENDIF145 + +ENDIF145: ; preds = %IF140, %IF146 + %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ] + %481 = fadd float %temp88.1, %424 + %482 = fmul float %424, 5.000000e-01 + %483 = fmul float %129, %481 + %484 = fadd float %483, %22 + %485 = fmul float %130, %481 + %486 = fadd float %485, %23 + %487 = insertelement <4 x float> undef, float %484, i32 0 + %488 = insertelement <4 x float> %487, float %486, i32 1 + %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2 + %490 = insertelement <4 x float> %489, float %440, i32 3 + %491 = extractelement <4 x float> %490, i32 0 + %492 = extractelement <4 x float> %490, i32 1 + %493 = insertelement <4 x float> undef, float %491, i32 0 + %494 = insertelement <4 x float> %493, float %492, i32 1 + %495 = insertelement <4 x float> %494, float undef, i32 2 + %496 = insertelement <4 x float> %495, float undef, i32 3 + %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2) + %498 = extractelement <4 x float> %497, i32 3 + %499 = fcmp oge float %481, %498 + %500 = sext i1 %499 to i32 + %501 = bitcast i32 %500 to float + %502 = bitcast float %501 to i32 + %503 = icmp ne i32 %502, 0 + br i1 %503, label %IF149, label %ENDIF148 + +IF149: ; preds = %ENDIF145 + %504 = fmul float 2.000000e+00, %482 + %505 = fsub float -0.000000e+00, %504 + %506 = fadd float %481, %505 + br label %ENDIF148 + +ENDIF148: ; preds = %ENDIF145, %IF149 + %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ] + %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ] + %507 = fadd float %temp88.2, %482 + %508 = fmul float %482, 5.000000e-01 + %509 = fmul float %129, %507 + %510 = fadd float %509, %22 + %511 = fmul float %130, %507 + %512 = fadd float %511, %23 + %513 = insertelement <4 x float> undef, float %510, i32 0 + %514 = insertelement <4 x float> %513, float %512, i32 1 + %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2 + %516 = insertelement <4 x float> %515, float %498, i32 3 + %517 = extractelement <4 x float> %516, i32 0 + %518 = extractelement <4 x float> %516, i32 1 + %519 = insertelement <4 x float> undef, float %517, i32 0 + %520 = insertelement <4 x float> %519, float %518, i32 1 + %521 = insertelement <4 x float> %520, float undef, i32 2 + %522 = insertelement <4 x float> %521, float undef, i32 3 + %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2) + %524 = extractelement <4 x float> %523, i32 3 + %525 = fcmp oge float %507, %524 + %526 = sext i1 %525 to i32 + %527 = bitcast i32 %526 to float + 
%528 = bitcast float %527 to i32 + %529 = icmp ne i32 %528, 0 + br i1 %529, label %IF152, label %ENDIF151 + +IF152: ; preds = %ENDIF148 + %530 = fmul float 2.000000e+00, %508 + %531 = fsub float -0.000000e+00, %530 + %532 = fadd float %507, %531 + br label %ENDIF151 + +ENDIF151: ; preds = %ENDIF148, %IF152 + %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ] + %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ] + %533 = fadd float %temp88.3, %508 + %534 = fmul float %508, 5.000000e-01 + %535 = fmul float %129, %533 + %536 = fadd float %535, %22 + %537 = fmul float %130, %533 + %538 = fadd float %537, %23 + %539 = insertelement <4 x float> undef, float %536, i32 0 + %540 = insertelement <4 x float> %539, float %538, i32 1 + %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2 + %542 = insertelement <4 x float> %541, float %524, i32 3 + %543 = extractelement <4 x float> %542, i32 0 + %544 = extractelement <4 x float> %542, i32 1 + %545 = insertelement <4 x float> undef, float %543, i32 0 + %546 = insertelement <4 x float> %545, float %544, i32 1 + %547 = insertelement <4 x float> %546, float undef, i32 2 + %548 = insertelement <4 x float> %547, float undef, i32 3 + %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2) + %550 = extractelement <4 x float> %549, i32 3 + %551 = fcmp oge float %533, %550 + %552 = sext i1 %551 to i32 + %553 = bitcast i32 %552 to float + %554 = bitcast float %553 to i32 + %555 = icmp ne i32 %554, 0 + br i1 %555, label %IF155, label %ENDIF154 + +IF155: ; preds = %ENDIF151 + %556 = fmul float 2.000000e+00, %534 + %557 = fsub float -0.000000e+00, %556 + %558 = fadd float %533, %557 + br label %ENDIF154 + +ENDIF154: ; preds = %ENDIF151, %IF155 + %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ] + %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ] + %559 = fadd float %temp88.4, %534 + %560 = fmul float %129, %559 + %561 = fadd float %560, %22 + %562 = fmul float %130, %559 + %563 = fadd float %562, %23 + %564 = insertelement <4 x float> undef, float %561, i32 0 + %565 = insertelement <4 x float> %564, float %563, i32 1 + %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2 + %567 = insertelement <4 x float> %566, float %550, i32 3 + %568 = extractelement <4 x float> %567, i32 0 + %569 = extractelement <4 x float> %567, i32 1 + %570 = insertelement <4 x float> undef, float %568, i32 0 + %571 = insertelement <4 x float> %570, float %569, i32 1 + %572 = insertelement <4 x float> %571, float undef, i32 2 + %573 = insertelement <4 x float> %572, float undef, i32 3 + %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2) + %575 = extractelement <4 x float> %574, i32 3 + %576 = fcmp oge float %559, %575 + %577 = sext i1 %576 to i32 + %578 = bitcast i32 %577 to float + %579 = bitcast float %578 to i32 + %580 = icmp ne i32 %579, 0 + %.temp92.4 = select i1 %580, float %559, float %temp92.4 + %581 = fmul float %129, %.temp92.4 + %582 = fadd float %581, %22 + %583 = fmul float %130, %.temp92.4 + %584 = fadd float %583, %23 + %585 = insertelement <4 x float> undef, float %582, i32 0 + %586 = insertelement <4 x float> %585, float %584, i32 1 + %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2 + %588 = insertelement <4 x float> %587, float %575, i32 3 + %589 = extractelement <4 x float> %588, i32 0 + %590 = extractelement <4 x float> %588, i32 1 + %591 = insertelement <4 x float> undef, float %589, i32 0 + %592 = insertelement <4 x float> %591, 
float %590, i32 1 + %593 = insertelement <4 x float> %592, float undef, i32 2 + %594 = insertelement <4 x float> %593, float undef, i32 3 + %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2) + %596 = extractelement <4 x float> %595, i32 0 + %597 = extractelement <4 x float> %595, i32 1 + %598 = extractelement <4 x float> %595, i32 2 + %599 = fmul float %596, 2.000000e+00 + %600 = fadd float %599, -1.000000e+00 + %601 = fmul float %597, 2.000000e+00 + %602 = fadd float %601, -1.000000e+00 + %603 = fmul float %598, 2.000000e+00 + %604 = fadd float %603, -1.000000e+00 + br label %ENDIF136 + +IF161: ; preds = %ENDIF136 + %605 = fmul float %202, 0x3FB99999A0000000 + %606 = fcmp uge float 0x3FE4CCCCC0000000, %605 + %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605 + %608 = fcmp uge float %607, 5.000000e-01 + %609 = select i1 %608, float 5.000000e-01, float %607 + %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300) + %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301) + %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302) + %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303) + %614 = insertelement <4 x float> undef, float %329, i32 0 + %615 = insertelement <4 x float> %614, float %330, i32 1 + %616 = insertelement <4 x float> %615, float %331, i32 2 + %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3 + %618 = insertelement <4 x float> undef, float %63, i32 0 + %619 = insertelement <4 x float> %618, float %65, i32 1 + %620 = insertelement <4 x float> %619, float %67, i32 2 + %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3 + %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621) + %623 = fcmp uge float 0x3FE6666660000000, %622 + %624 = select i1 %623, float 0x3FE6666660000000, float %622 + %625 = fmul float %8, %624 + %626 = fmul float %13, %624 + %627 = fmul float %18, %624 + %628 = insertelement <4 x float> undef, float %34, i32 0 + %629 = insertelement <4 x float> %628, float %35, i32 1 + %630 = insertelement <4 x float> %629, float %36, i32 2 + %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3 + %632 = insertelement <4 x float> undef, float %63, i32 0 + %633 = insertelement <4 x float> %632, float %65, i32 1 + %634 = insertelement <4 x float> %633, float %67, i32 2 + %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3 + %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635) + %637 = fcmp uge float 0x3FECCCCCC0000000, %636 + %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636 + %639 = fmul float %625, %638 + %640 = fmul float %626, %638 + %641 = fmul float %627, %638 + br label %ENDIF160 + +ENDIF160: ; preds = %ENDIF136, %IF161 + %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ] + %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ] + %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ] + %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ] + %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ] + %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ] + %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ] + %642 = fcmp olt float 2.200000e+03, %179 + %643 = sext i1 %642 to i32 + %644 = bitcast i32 %643 to float + %645 = fcmp olt float %179, 2.300000e+03 + %646 = sext i1 %645 to i32 + %647 = bitcast i32 %646 to float + %648 = bitcast float %644 to i32 + %649 = bitcast float %647 to i32 + %650 = and i32 %648, %649 + %651 = 
bitcast i32 %650 to float + %652 = bitcast float %651 to i32 + %653 = icmp ne i32 %652, 0 + br i1 %653, label %IF164, label %ENDIF163 + +IF164: ; preds = %ENDIF160 + %654 = fmul float %202, 5.000000e-01 + %655 = fcmp uge float 0x3FE4CCCCC0000000, %654 + %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654 + %657 = fcmp uge float %656, 0x3FD6666660000000 + %658 = select i1 %657, float 0x3FD6666660000000, float %656 + %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300) + %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301) + %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302) + %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303) + %663 = insertelement <4 x float> undef, float %329, i32 0 + %664 = insertelement <4 x float> %663, float %330, i32 1 + %665 = insertelement <4 x float> %664, float %331, i32 2 + %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3 + %667 = insertelement <4 x float> undef, float %63, i32 0 + %668 = insertelement <4 x float> %667, float %65, i32 1 + %669 = insertelement <4 x float> %668, float %67, i32 2 + %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3 + %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670) + %672 = fcmp uge float 0x3FE6666660000000, %671 + %673 = select i1 %672, float 0x3FE6666660000000, float %671 + %674 = fmul float %8, %673 + %675 = fmul float %13, %673 + %676 = fmul float %18, %673 + %677 = insertelement <4 x float> undef, float %34, i32 0 + %678 = insertelement <4 x float> %677, float %35, i32 1 + %679 = insertelement <4 x float> %678, float %36, i32 2 + %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3 + %681 = insertelement <4 x float> undef, float %63, i32 0 + %682 = insertelement <4 x float> %681, float %65, i32 1 + %683 = insertelement <4 x float> %682, float %67, i32 2 + %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3 + %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684) + %686 = fcmp uge float 0x3FECCCCCC0000000, %685 + %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685 + %688 = fmul float %674, %687 + %689 = fmul float %675, %687 + %690 = fmul float %676, %687 + br label %ENDIF163 + +ENDIF163: ; preds = %ENDIF160, %IF164 + %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ] + %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ] + %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ] + %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ] + %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ] + %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ] + %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ] + %691 = fcmp oge float %179, 2.300000e+03 + %692 = sext i1 %691 to i32 + %693 = bitcast i32 %692 to float + %694 = fcmp olt float %179, 2.480000e+03 + %695 = sext i1 %694 to i32 + %696 = bitcast i32 %695 to float + %697 = bitcast float %693 to i32 + %698 = bitcast float %696 to i32 + %699 = and i32 %697, %698 + %700 = bitcast i32 %699 to float + %701 = bitcast float %700 to i32 + %702 = icmp ne i32 %701, 0 + br i1 %702, label %IF167, label %ENDIF166 + +IF167: ; preds = %ENDIF163 + %703 = fmul float %202, 5.000000e-01 + %704 = fcmp uge float 0x3FE4CCCCC0000000, %703 + %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703 + %706 = fcmp uge float %705, 0x3FD3333340000000 + %707 = select i1 %706, float 0x3FD3333340000000, float %705 + %708 = call float 
@llvm.AMDGPU.lrp(float %707, float %409, float %300) + %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301) + %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302) + %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303) + %712 = insertelement <4 x float> undef, float %329, i32 0 + %713 = insertelement <4 x float> %712, float %330, i32 1 + %714 = insertelement <4 x float> %713, float %331, i32 2 + %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3 + %716 = insertelement <4 x float> undef, float %63, i32 0 + %717 = insertelement <4 x float> %716, float %65, i32 1 + %718 = insertelement <4 x float> %717, float %67, i32 2 + %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3 + %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719) + %721 = fcmp uge float 0x3FEB333340000000, %720 + %722 = select i1 %721, float 0x3FEB333340000000, float %720 + %723 = fmul float %8, %722 + %724 = fmul float %13, %722 + %725 = fmul float %18, %722 + %726 = insertelement <4 x float> undef, float %34, i32 0 + %727 = insertelement <4 x float> %726, float %35, i32 1 + %728 = insertelement <4 x float> %727, float %36, i32 2 + %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3 + %730 = insertelement <4 x float> undef, float %63, i32 0 + %731 = insertelement <4 x float> %730, float %65, i32 1 + %732 = insertelement <4 x float> %731, float %67, i32 2 + %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3 + %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733) + %735 = fcmp uge float 0x3FECCCCCC0000000, %734 + %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734 + %737 = fmul float %723, %736 + %738 = fmul float %724, %736 + %739 = fmul float %725, %736 + br label %ENDIF166 + +ENDIF166: ; preds = %ENDIF163, %IF167 + %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ] + %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ] + %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ] + %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ] + %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ] + %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ] + %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ] + %740 = fcmp oge float %179, 2.480000e+03 + %741 = sext i1 %740 to i32 + %742 = bitcast i32 %741 to float + %743 = fcmp olt float %179, 2.530000e+03 + %744 = sext i1 %743 to i32 + %745 = bitcast i32 %744 to float + %746 = bitcast float %742 to i32 + %747 = bitcast float %745 to i32 + %748 = and i32 %746, %747 + %749 = bitcast i32 %748 to float + %750 = bitcast float %749 to i32 + %751 = icmp ne i32 %750, 0 + br i1 %751, label %IF170, label %ENDIF169 + +IF170: ; preds = %ENDIF166 + %752 = fmul float %202, 5.000000e-01 + %753 = fcmp uge float 0x3FE4CCCCC0000000, %752 + %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752 + %755 = fcmp uge float %754, 0x3FC99999A0000000 + %756 = select i1 %755, float 0x3FC99999A0000000, float %754 + %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300) + %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301) + %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302) + %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303) + %761 = insertelement <4 x float> undef, float %329, i32 0 + %762 = insertelement <4 x float> %761, float %330, i32 1 + %763 = insertelement <4 x float> %762, float %331, i32 2 + %764 = 
insertelement <4 x float> %763, float 0.000000e+00, i32 3 + %765 = insertelement <4 x float> undef, float %63, i32 0 + %766 = insertelement <4 x float> %765, float %65, i32 1 + %767 = insertelement <4 x float> %766, float %67, i32 2 + %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3 + %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768) + %770 = fcmp uge float 0x3FEB333340000000, %769 + %771 = select i1 %770, float 0x3FEB333340000000, float %769 + %772 = fmul float %8, %771 + %773 = fmul float %13, %771 + %774 = fmul float %18, %771 + %775 = insertelement <4 x float> undef, float %34, i32 0 + %776 = insertelement <4 x float> %775, float %35, i32 1 + %777 = insertelement <4 x float> %776, float %36, i32 2 + %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3 + %779 = insertelement <4 x float> undef, float %63, i32 0 + %780 = insertelement <4 x float> %779, float %65, i32 1 + %781 = insertelement <4 x float> %780, float %67, i32 2 + %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3 + %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782) + %784 = fcmp uge float 0x3FECCCCCC0000000, %783 + %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783 + %786 = fmul float %772, %785 + %787 = fmul float %773, %785 + %788 = fmul float %774, %785 + br label %ENDIF169 + +ENDIF169: ; preds = %ENDIF166, %IF170 + %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ] + %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ] + %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ] + %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ] + %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ] + %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ] + %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ] + %789 = fcmp oge float %179, 2.530000e+03 + %790 = sext i1 %789 to i32 + %791 = bitcast i32 %790 to float + %792 = fcmp olt float %179, 2.670000e+03 + %793 = sext i1 %792 to i32 + %794 = bitcast i32 %793 to float + %795 = bitcast float %791 to i32 + %796 = bitcast float %794 to i32 + %797 = and i32 %795, %796 + %798 = bitcast i32 %797 to float + %799 = bitcast float %798 to i32 + %800 = icmp ne i32 %799, 0 + br i1 %800, label %IF173, label %ENDIF172 + +IF173: ; preds = %ENDIF169 + %801 = fmul float %202, 5.000000e-01 + %802 = fcmp uge float 0x3FE4CCCCC0000000, %801 + %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801 + %804 = fcmp uge float %803, 0x3FB99999A0000000 + %805 = select i1 %804, float 0x3FB99999A0000000, float %803 + %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300) + %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301) + %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302) + %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303) + %810 = insertelement <4 x float> undef, float %329, i32 0 + %811 = insertelement <4 x float> %810, float %330, i32 1 + %812 = insertelement <4 x float> %811, float %331, i32 2 + %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3 + %814 = insertelement <4 x float> undef, float %63, i32 0 + %815 = insertelement <4 x float> %814, float %65, i32 1 + %816 = insertelement <4 x float> %815, float %67, i32 2 + %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3 + %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817) + %819 = fcmp uge float 0x3FEB333340000000, %818 + %820 = select i1 %819, float 
0x3FEB333340000000, float %818 + %821 = fmul float %8, %820 + %822 = fmul float %13, %820 + %823 = fmul float %18, %820 + %824 = insertelement <4 x float> undef, float %34, i32 0 + %825 = insertelement <4 x float> %824, float %35, i32 1 + %826 = insertelement <4 x float> %825, float %36, i32 2 + %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3 + %828 = insertelement <4 x float> undef, float %63, i32 0 + %829 = insertelement <4 x float> %828, float %65, i32 1 + %830 = insertelement <4 x float> %829, float %67, i32 2 + %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3 + %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831) + %833 = fcmp uge float 0x3FECCCCCC0000000, %832 + %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832 + %835 = fmul float %821, %834 + %836 = fmul float %822, %834 + %837 = fmul float %823, %834 + br label %ENDIF172 + +ENDIF172: ; preds = %ENDIF169, %IF173 + %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ] + %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ] + %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ] + %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ] + %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ] + %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ] + %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ] + %838 = fcmp oge float %179, 2.670000e+03 + %839 = sext i1 %838 to i32 + %840 = bitcast i32 %839 to float + %841 = bitcast float %840 to i32 + %842 = icmp ne i32 %841, 0 + br i1 %842, label %IF176, label %ENDIF175 + +IF176: ; preds = %ENDIF172 + %843 = fmul float %202, 0x3FB99999A0000000 + %844 = fcmp uge float 0.000000e+00, %843 + %845 = select i1 %844, float 0.000000e+00, float %843 + %846 = fcmp uge float %845, 0x3FD99999A0000000 + %847 = select i1 %846, float 0x3FD99999A0000000, float %845 + %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300) + %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301) + %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302) + %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303) + %852 = insertelement <4 x float> undef, float %329, i32 0 + %853 = insertelement <4 x float> %852, float %330, i32 1 + %854 = insertelement <4 x float> %853, float %331, i32 2 + %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3 + %856 = insertelement <4 x float> undef, float %63, i32 0 + %857 = insertelement <4 x float> %856, float %65, i32 1 + %858 = insertelement <4 x float> %857, float %67, i32 2 + %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3 + %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859) + %861 = fcmp uge float 0x3FEB333340000000, %860 + %862 = select i1 %861, float 0x3FEB333340000000, float %860 + %863 = fmul float %8, %862 + %864 = fmul float %13, %862 + %865 = fmul float %18, %862 + %866 = insertelement <4 x float> undef, float %34, i32 0 + %867 = insertelement <4 x float> %866, float %35, i32 1 + %868 = insertelement <4 x float> %867, float %36, i32 2 + %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3 + %870 = insertelement <4 x float> undef, float %63, i32 0 + %871 = insertelement <4 x float> %870, float %65, i32 1 + %872 = insertelement <4 x float> %871, float %67, i32 2 + %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3 + %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873) + %875 = fcmp uge float 
0x3FECCCCCC0000000, %874 + %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874 + %877 = fmul float %863, %876 + %878 = fmul float %864, %876 + %879 = fmul float %865, %876 + br label %ENDIF175 + +ENDIF175: ; preds = %ENDIF172, %IF176 + %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ] + %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ] + %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ] + %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ] + %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ] + %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ] + %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ] + %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %881 = extractelement <4 x float> %880, i32 0 + %882 = fcmp olt float %881, %179 + %883 = sext i1 %882 to i32 + %884 = bitcast i32 %883 to float + %885 = bitcast float %884 to i32 + %886 = icmp ne i32 %885, 0 + br i1 %886, label %IF179, label %ENDIF178 + +IF179: ; preds = %ENDIF175 + %887 = fadd float %202, 1.000000e+00 + %888 = fadd float %202, 1.000000e+00 + %889 = fadd float %202, 1.000000e+00 + %890 = insertelement <4 x float> undef, float %43, i32 0 + %891 = insertelement <4 x float> %890, float %44, i32 1 + %892 = insertelement <4 x float> %891, float %45, i32 2 + %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3 + %894 = insertelement <4 x float> undef, float %43, i32 0 + %895 = insertelement <4 x float> %894, float %44, i32 1 + %896 = insertelement <4 x float> %895, float %45, i32 2 + %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 + %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) + %899 = call float @llvm.AMDGPU.rsq.f32(float %898) + %900 = fmul float %45, %899 + %901 = call float @fabs(float %900) + %902 = fmul float %176, 0x3FECCCCCC0000000 + %903 = fadd float %902, %901 + %904 = fadd float %903, 0xBFEFAE1480000000 + %905 = fmul float %904, 0xC043FFFE20000000 + %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00) + %907 = fmul float 2.000000e+00, %906 + %908 = fsub float -0.000000e+00, %907 + %909 = fadd float 3.000000e+00, %908 + %910 = fmul float %906, %909 + %911 = fmul float %906, %910 + %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887) + %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888) + %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889) + %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00) + %916 = fmul float %202, 5.000000e-01 + %917 = fcmp uge float 0x3FE4CCCCC0000000, %916 + %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916 + %919 = fcmp uge float %918, 0x3FE3333340000000 + %920 = select i1 %919, float 0x3FE3333340000000, float %918 + %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5) + %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5) + %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5) + %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5) + %925 = insertelement <4 x float> undef, float %329, i32 0 + %926 = insertelement <4 x float> %925, float %330, i32 1 + %927 = insertelement <4 x float> %926, float %331, i32 2 + %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3 + %929 = insertelement <4 x float> 
undef, float %63, i32 0 + %930 = insertelement <4 x float> %929, float %65, i32 1 + %931 = insertelement <4 x float> %930, float %67, i32 2 + %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3 + %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932) + %934 = fcmp uge float 0x3FE99999A0000000, %933 + %935 = select i1 %934, float 0x3FE99999A0000000, float %933 + %936 = fmul float %8, %935 + %937 = fmul float %13, %935 + %938 = fmul float %18, %935 + %939 = insertelement <4 x float> undef, float %34, i32 0 + %940 = insertelement <4 x float> %939, float %35, i32 1 + %941 = insertelement <4 x float> %940, float %36, i32 2 + %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3 + %943 = insertelement <4 x float> undef, float %63, i32 0 + %944 = insertelement <4 x float> %943, float %65, i32 1 + %945 = insertelement <4 x float> %944, float %67, i32 2 + %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3 + %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946) + %948 = fcmp uge float 0x3FECCCCCC0000000, %947 + %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947 + %950 = fmul float %936, %949 + %951 = fmul float %937, %949 + %952 = fmul float %938, %949 + br label %ENDIF178 + +ENDIF178: ; preds = %ENDIF175, %IF179 + %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ] + %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ] + %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ] + %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ] + %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ] + %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ] + %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ] + %953 = fmul float %55, %temp92.12 + %954 = fmul float %57, %temp93.6 + %955 = fmul float %59, %temp94.6 + %956 = fmul float %61, 0.000000e+00 + %957 = fmul float %temp84.6, %953 + %958 = fmul float %temp85.6, %954 + %959 = fmul float %temp86.6, %955 + %960 = fmul float %temp87.6, %956 + %961 = fmul float %2, -2.000000e+00 + %962 = fadd float %961, 1.000000e+00 + %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23) + %964 = extractelement <4 x float> %963, i32 2 + %965 = fsub float -0.000000e+00, %964 + %966 = fadd float %962, %965 + %967 = fdiv float 1.000000e+00, %966 + %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24) + %969 = extractelement <4 x float> %968, i32 2 + %970 = fmul float %969, %967 + %971 = fsub float -0.000000e+00, %53 + %972 = fmul float %971, %53 + %973 = fmul float %972, %970 + %974 = fmul float %973, %970 + %975 = fmul float %974, 0x3FF7154760000000 + %976 = call float @llvm.AMDIL.exp.(float %975) + %977 = fcmp oeq float %53, 1.000000e+00 + %978 = sext i1 %977 to i32 + %979 = bitcast i32 %978 to float + %980 = bitcast float %979 to i32 + %981 = icmp ne i32 %980, 0 + %.184 = select i1 %981, float 1.000000e+00, float %976 + %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47) + %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49) + %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51) + %985 = insertelement <4 x float> undef, float %982, i32 0 + %986 = insertelement <4 x float> %985, float %983, i32 1 + %987 = insertelement <4 x float> %986, float %984, i32 2 + %988 = insertelement <4 x float> 
%987, float %960, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } +attributes #2 = { readonly } diff --git a/test/CodeGen/AMDGPU/bitcast.ll b/test/CodeGen/AMDGPU/bitcast.ll new file mode 100644 index 00000000000..fd56d956bf3 --- /dev/null +++ b/test/CodeGen/AMDGPU/bitcast.ll @@ -0,0 +1,79 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; This test just checks that the compiler doesn't crash. + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +; FUNC-LABEL: {{^}}v32i8_to_v8i32: +; SI: s_endpgm +define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +entry: + %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 + %2 = bitcast <32 x i8> %1 to <8 x i32> + %3 = extractelement <8 x i32> %2, i32 1 + %4 = icmp ne i32 %3, 0 + %5 = select i1 %4, float 0.0, float 1.0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) + ret void +} + +; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: +; SI: s_endpgm +define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* + %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 + store <16 x i8> %1, <16 x i8> addrspace(1)* %out + ret void +} + +define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %load = load float, float addrspace(1)* %in, align 4 + %bc = bitcast float %load to <2 x i16> + store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 + %bc = bitcast <2 x i16> %load to float + store float %bc, float addrspace(1)* %out, align 4 + ret void +} + +define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %bc = bitcast <4 x i8> %load to i32 + store i32 %bc, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %bc = bitcast i32 %load to <4 x i8> + store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: +; SI: s_endpgm +define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %add = add <2 x i32> %val, + %bc = bitcast <2 x i32> %add to double + store double %bc, double addrspace(1)* %out, 
align 8 + ret void +} + +; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: +; SI: s_endpgm +define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { + %val = load double, double addrspace(1)* %in, align 8 + %add = fadd double %val, 4.0 + %bc = bitcast double %add to <2 x i32> + store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll new file mode 100644 index 00000000000..4cf8e4bfed5 --- /dev/null +++ b/test/CodeGen/AMDGPU/bswap.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.bswap.i32(i32) nounwind readnone +declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone +declare i64 @llvm.bswap.i64(i64) nounwind readnone +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone +declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone + +; FUNC-LABEL: @test_bswap_i32 +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 +; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff +; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone + store i32 %bswap, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_bswap_v2i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone + store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_bswap_v4i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone + store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_bswap_v8i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: 
v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_bfi_b32 +; SI: s_endpgm +define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { + %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone + store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %val = load i64, i64 addrspace(1)* %in, align 8 + %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone + store i64 %bswap, i64 addrspace(1)* %out, align 8 + ret void +} + +define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone + store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 + ret void +} + +define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { + %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 + %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone + store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll new file mode 100644 index 00000000000..65eacf5adc4 --- /dev/null +++ b/test/CodeGen/AMDGPU/build_vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI + +; R600: {{^}}build_vector2: +; R600: MOV +; R600: MOV +; R600-NOT: MOV +; SI: {{^}}build_vector2: +; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 +; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 +; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} +define void @build_vector2 (<2 x i32> addrspace(1)* %out) { +entry: + store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out + ret void +} + +; R600: {{^}}build_vector4: +; R600: MOV +; R600: MOV +; R600: MOV +; R600: MOV +; R600-NOT: MOV +; SI: {{^}}build_vector4: +; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 +; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 +; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 +; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 +; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} +define void @build_vector4 (<4 x i32> addrspace(1)* %out) { +entry: + store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll new file mode 100644 index 00000000000..e769fd11c28 --- /dev/null +++ b/test/CodeGen/AMDGPU/call.ll @@ -0,0 +1,33 @@ +; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s +; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported call to function external_function in test_call_external + + +declare i32 @external_function(i32) nounwind + +define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %c = call i32
@external_function(i32 %b) nounwind + %result = add i32 %a, %c + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define i32 @defined_function(i32 %x) nounwind noinline { + %y = add i32 %x, 8 + ret i32 %y +} + +define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %c = call i32 @defined_function(i32 %b) nounwind + %result = add i32 %a, %c + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/call_fs.ll b/test/CodeGen/AMDGPU/call_fs.ll new file mode 100644 index 00000000000..87bebbc49d5 --- /dev/null +++ b/test/CodeGen/AMDGPU/call_fs.ll @@ -0,0 +1,17 @@ + +; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s + +; EG: .long 257 +; EG: {{^}}call_fs: +; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] +; R600: .long 257 +; R600: {{^}}call_fs: +; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] + + +define void @call_fs() #0 { + ret void +} + +attributes #0 = { "ShaderType"="1" } ; Vertex Shader diff --git a/test/CodeGen/AMDGPU/cayman-loop-bug.ll b/test/CodeGen/AMDGPU/cayman-loop-bug.ll new file mode 100644 index 00000000000..c7b8c403731 --- /dev/null +++ b/test/CodeGen/AMDGPU/cayman-loop-bug.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: LOOP_START_DX10 +; CHECK: ALU_PUSH_BEFORE +; CHECK: LOOP_START_DX10 +; CHECK: PUSH +; CHECK-NOT: ALU_PUSH_BEFORE +; CHECK: END_LOOP +; CHECK: END_LOOP +define void @main (<4 x float> inreg %reg0) #0 { +entry: + br label %outer_loop +outer_loop: + %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop] + %cond = icmp eq i32 %cnt, 16 + br i1 %cond, label %outer_loop_body, label %exit +outer_loop_body: + %cnt_incr = add i32 %cnt, 1 + br label %inner_loop +inner_loop: + %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body] + %cond2 = icmp eq i32 %cnt2, 16 + br i1 %cond, label %inner_loop_body, label %outer_loop +inner_loop_body: + %cnt2_incr = add i32 %cnt2, 1 + br label %inner_loop +exit: + ret void +} + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/cf-stack-bug.ll b/test/CodeGen/AMDGPU/cf-stack-bug.ll new file mode 100644 index 00000000000..75b87e48622 --- /dev/null +++ b/test/CodeGen/AMDGPU/cf-stack-bug.ll @@ -0,0 +1,244 @@ +; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG64 %s < %t + +; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=BUG32 %s < %t + +; RUN: llc -march=r600 
-mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC +; RUN: FileCheck --check-prefix=NOBUG %s < %t + +; REQUIRES: asserts + +; We are currently allocating 2 extra sub-entries on Evergreen / NI for +; non-WQM push instructions if we change this to 1, then we will need to +; add one level of depth to each of these tests. + +; BUG64-NOT: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested3: +define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.store.1 + +if.store.1: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + store i32 3, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested4: +define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + store i32 4, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested7: +define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + store i32 7, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: {{^}}nested8: +define 
void @nested8(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + %7 = icmp sgt i32 %cond, 70 + br i1 %7, label %if.8, label %if.7.store + +if.7.store: + store i32 7, i32 addrspace(1)* %out + br label %end + +if.8: + store i32 8, i32 addrspace(1)* %out + br label %end + +end: + ret void +} diff --git a/test/CodeGen/AMDGPU/cf_end.ll b/test/CodeGen/AMDGPU/cf_end.ll new file mode 100644 index 00000000000..c74ee22868d --- /dev/null +++ b/test/CodeGen/AMDGPU/cf_end.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s + +; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] +; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] +define void @eop() { + ret void +} diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll new file mode 100644 index 00000000000..77f7bd01b7f --- /dev/null +++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -0,0 +1,242 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; OPT-LABEL: @test_sink_global_small_offset_i32( +; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: +; GCN: {{^}}BB0_2: +define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( +; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB1_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* 
%out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} +; GCN: {{^}}BB2_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB3_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_flat_small_offset_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %in +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: +; GCN: flat_load_dword +; GCN: {{^}}BB4_2: + +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(4)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_scratch_small_offset_i32( +; OPT-NOT: getelementptr [512 x i32] +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: {{^}}BB5_2: +define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + 
%add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( +; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: {{^}}BB6_2: +define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: {{^}}BB7_2: +define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { +entry: + %offset.ext = zext i32 %offset to i64 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll new file mode 100644 index 00000000000..96730bcf2e8 --- /dev/null +++ b/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s + +declare float @llvm.fma.f32(float, float, float) + +; This checks that rematerialization support of the coalescer does not +; unnecessarily widen the register class. Without those fixes > 20 VGprs +; are used here +; Also check that some rematerialization of the 0 constant happened. 
+; CHECK-LABEL: foobar +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 +; It's probably OK if this is slightly higher: +; CHECK: ; NumVgprs: 9 +define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +entry: + %cmpflag = icmp eq i32 %flag, 1 + br i1 %cmpflag, label %loop, label %exit + +loop: + %c = phi i32 [0, %entry], [%cnext, %loop] + %v0 = phi float [0.0, %entry], [%fma.0, %loop] + %v1 = phi float [0.0, %entry], [%fma.1, %loop] + %v2 = phi float [0.0, %entry], [%fma.2, %loop] + %v3 = phi float [0.0, %entry], [%fma.3, %loop] + + ; Try to get the 0 constant to get coalesced into a wide register + %blup = insertelement <4 x float> undef, float %v0, i32 0 + store <4 x float> %blup, <4 x float> addrspace(1)* %out + + %load = load <4 x float>, <4 x float> addrspace(1)* %in + %load.0 = extractelement <4 x float> %load, i32 0 + %load.1 = extractelement <4 x float> %load, i32 1 + %load.2 = extractelement <4 x float> %load, i32 2 + %load.3 = extractelement <4 x float> %load, i32 3 + %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0) + %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1) + %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2) + %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3) + + %cnext = add nsw i32 %c, 1 + %cmp = icmp eq i32 %cnext, 42 + br i1 %cmp, label %exit, label %loop + +exit: + %ev0 = phi float [0.0, %entry], [%fma.0, %loop] + %ev1 = phi float [0.0, %entry], [%fma.1, %loop] + %ev2 = phi float [0.0, %entry], [%fma.2, %loop] + %ev3 = phi float [0.0, %entry], [%fma.3, %loop] + %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0 + %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1 + %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2 + %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3 + store <4 x float> %dst.3, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll new file mode 100644 index 00000000000..58517209267 --- /dev/null +++ b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll @@ -0,0 +1,18 @@ +; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s + +; OPT-LABEL: @test( +; OPT: mul nsw i32 +; OPT-NEXT: sext + +; SI-LLC-LABEL: {{^}}test: +; SI-LLC: s_mul_i32 +; SI-LLC-NOT: mul +define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { +entry: + %0 = mul nsw i32 %a, 3 + %1 = sext i32 %0 to i64 + %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1 + store i8 %b, i8 addrspace(1)* %2 + ret void +} diff --git a/test/CodeGen/AMDGPU/combine_vloads.ll b/test/CodeGen/AMDGPU/combine_vloads.ll new file mode 100644 index 00000000000..01572afa620 --- /dev/null +++ b/test/CodeGen/AMDGPU/combine_vloads.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +; +; kernel void combine_vloads(global char8* src, global char8* result) { +; for (int i = 0; i < 1024; ++i) +; result[i] = src[0] + src[1] + src[2] + src[3]; +; } +; + + +; 128-bit loads instead of many 8-bit +; EG-LABEL: {{^}}combine_vloads: +; EG: VTX_READ_128 +; EG: VTX_READ_128 +define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> 
addrspace(1)* nocapture %result) nounwind { +entry: + br label %for.body + +for.exit:                                         ; preds = %for.body + ret void + +for.body:                                         ; preds = %for.body, %entry + %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ] + %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)* + %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)* + %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32 + %1 = bitcast <8 x i32> %vecload2 to <32 x i8> + %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %tmp9 = add nsw <8 x i8> %tmp5, %tmp8 + %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %tmp13 = add nsw <8 x i8> %tmp9, %tmp12 + %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %tmp17 = add nsw <8 x i8> %tmp13, %tmp16 + %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01 + %2 = bitcast <8 x i8> %tmp17 to <2 x i32> + %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)* + store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8 + %tmp19 = add nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %tmp19, 1024 + br i1 %exitcond, label %for.exit, label %for.body +} diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll new file mode 100644 index 00000000000..31766047a35 --- /dev/null +++ b/test/CodeGen/AMDGPU/commute-compares.ll @@ -0,0 +1,697 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; -------------------------------------------------------------------------------- +; i32 compares +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}commute_eq_64_i32: +; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp eq i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ne_64_i32: +; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ne i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Why isn't this being folded as a constant?
+; GCN-LABEL: {{^}}commute_ne_litk_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 +; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}} +define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ne i32 %val, 12345 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_64_i32: +; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ugt i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_64_i32: +; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} +define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp uge i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_64_i32: +; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ult i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_63_i32: +; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} +define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ule i32 %val, 63 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm + +; GCN-LABEL: {{^}}commute_ule_64_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} +; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} +define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp ule i32 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sgt_neg1_i32: +; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} +define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = 
getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sgt i32 %val, -1 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sge_neg2_i32: +; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} +define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sge i32 %val, -2 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_slt_neg16_i32: +; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} +define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp slt i32 %val, -16 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sle_5_i32: +; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} +define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i32, i32 addrspace(1)* %gep.in + %cmp = icmp sle i32 %val, 5 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; i64 compares +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}commute_eq_64_i64: +; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp eq i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ne_64_i64: +; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ne i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_64_i64: +; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ugt i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_64_i64: +; GCN: 
v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp uge i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_64_i64: +; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ult i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_63_i64: +; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ule i64 %val, 63 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm + +; GCN-LABEL: {{^}}commute_ule_64_i64: +; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} +; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp ule i64 %val, 64 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sgt_neg1_i64: +; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sgt i64 %val, -1 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sge_neg2_i64: +; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sge i64 %val, -2 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_slt_neg16_i64: +; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + 
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp slt i64 %val, -16 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_sle_5_i64: +; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load i64, i64 addrspace(1)* %gep.in + %cmp = icmp sle i64 %val, 5 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; f32 compares +; -------------------------------------------------------------------------------- + + +; GCN-LABEL: {{^}}commute_oeq_2.0_f32: +; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp oeq float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + + +; GCN-LABEL: {{^}}commute_ogt_2.0_f32: +; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ogt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_oge_2.0_f32: +; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp oge float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_olt_2.0_f32: +; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp olt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ole_2.0_f32: +; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ole float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* 
%gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_one_2.0_f32: +; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp one float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ord_2.0_f32: +; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] +define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ord float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ueq_2.0_f32: +; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ueq float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_2.0_f32: +; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ugt float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_2.0_f32: +; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp uge float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_2.0_f32: +; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ult float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_2.0_f32: +; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + 
%val = load float, float addrspace(1)* %gep.in + %cmp = fcmp ule float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_une_2.0_f32: +; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} +define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp une float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uno_2.0_f32: +; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] +define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load float, float addrspace(1)* %gep.in + %cmp = fcmp uno float %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; -------------------------------------------------------------------------------- +; f64 compares +; -------------------------------------------------------------------------------- + + +; GCN-LABEL: {{^}}commute_oeq_2.0_f64: +; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp oeq double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + + +; GCN-LABEL: {{^}}commute_ogt_2.0_f64: +; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ogt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_oge_2.0_f64: +; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp oge double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_olt_2.0_f64: +; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp olt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 
%ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ole_2.0_f64: +; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ole double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_one_2.0_f64: +; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp one double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ord_2.0_f64: +; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] +define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ord double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ueq_2.0_f64: +; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ueq double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ugt_2.0_f64: +; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ugt double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uge_2.0_f64: +; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp uge double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ult_2.0_f64: +; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() 
#0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ult double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_ule_2.0_f64: +; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp ule double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_une_2.0_f64: +; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} +define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp une double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}commute_uno_2.0_f64: +; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] +define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %val = load double, double addrspace(1)* %gep.in + %cmp = fcmp uno double %val, 2.0 + %ext = sext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll new file mode 100644 index 00000000000..7fc36eabb78 --- /dev/null +++ b/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -0,0 +1,181 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.fma.f32(float, float, float) nounwind readnone + +; FUNC-LABEL: @commute_add_imm_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %z = fadd float 2.0, %x.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = 
getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs + %z = fmul float 4.0, %x.fneg.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_imm_fneg_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fneg = fsub float -0.000000e+00, %x + %z = fmul float 4.0, %x.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; FIXME: Should use SGPR for literal. +; FUNC-LABEL: @commute_add_lit_fabs_f32 +; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 +; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %x = load float, float addrspace(1)* %gep.0 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %z = fadd float 1024.0, %x.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_add_fabs_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %z = fadd float %x, %y.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fneg_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fneg = fsub float -0.000000e+00, %y + %z = fmul float %x, %y.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fabs_fneg_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs + %z = fmul float %x, %y.fabs.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; There's no reason to commute this. +; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %z = fmul float %x.fabs, %y.fabs + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32 +; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| +; SI-NEXT: buffer_store_dword [[REG]] +define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %y.fabs = call float @llvm.fabs.f32(float %y) #1 + %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs + %z = fmul float %x.fabs, %y.fabs.fneg + store float %z, float addrspace(1)* %out + ret void +} + +; Make sure we commute the multiply part for the constant in src0 even +; though we have negate modifier on src2. 
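+; In other words, fma(%r1, 2.0, |%r2|) should select to roughly
+;   v_fma_f32 dst, 2.0, r1, |r2|
+; with the multiply operands swapped so the inline constant 2.0 lands in
+; src0, while the source modifier on src2 (|r2| in the test below) is kept.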
+ +; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32 +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]| +; SI: buffer_store_dword [[RESULT]] +define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r2.fabs = call float @llvm.fabs.f32(float %r2) + + %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/complex-folding.ll b/test/CodeGen/AMDGPU/complex-folding.ll new file mode 100644 index 00000000000..a5399a71324 --- /dev/null +++ b/test/CodeGen/AMDGPU/complex-folding.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main: +; CHECK-NOT: MOV +define void @main(<4 x float> inreg %reg0) #0 { +entry: + %0 = extractelement <4 x float> %reg0, i32 0 + %1 = call float @fabs(float %0) + %2 = fptoui float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = insertelement <4 x float> undef, float %3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0) + ret void +} + +declare float @fabs(float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll new file mode 100644 index 00000000000..a09ed1f7385 --- /dev/null +++ b/test/CodeGen/AMDGPU/concat_vectors.ll @@ -0,0 +1,296 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_concat_v1i32: +; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF +; instructions that access scratch memory. Bit 23, which is the add_tid_enable +; bit, is only set for scratch access, so we can check for the absence of this +; value if we want to ensure scratch memory is not being used. 
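+; As a quick check of that claim: bit 23 corresponds to the mask 0x800000,
+; and 0x80f000 & 0x800000 == 0x800000, so the add_tid_enable bit is indeed
+; set in this value. That is why the SI-NOT lines below look for the
+; 0x80f000 immediate being loaded into an SGPR.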
+; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { + %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> + store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { + %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> + store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { + %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { + %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { + %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> + store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { + %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> + store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> + store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> + store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> + store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16f32: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> + store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: 
{{^}}test_concat_v1i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { + %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> + store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> + store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> + store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> + store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16f64: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %concat = shufflevector <16 x 
double> %a, <16 x double> %b, <32 x i32> + store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { + %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> + store <2 x i1> %concat, <2 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { + %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> + store <4 x i1> %concat, <4 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { + %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> + store <8 x i1> %concat, <8 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { + %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> + store <16 x i1> %concat, <16 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { + %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> + store <32 x i1> %concat, <32 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v32i1: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { + %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> + store <64 x i1> %concat, <64 x i1> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v1i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { + %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> + store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v2i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { + %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> + store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v4i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { + %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> + store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v8i16: +; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { + %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}test_concat_v16i16: +; SI-NOT: 
s_mov_b32 s{{[0-9]}}, 0x80f000 +; SI-NOT: movrel +define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { + %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> + store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: {{^}}concat_vector_crash: +; SI: s_endpgm +define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +bb: + %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 + %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> + %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> + store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll new file mode 100644 index 00000000000..8b397566066 --- /dev/null +++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -0,0 +1,167 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_copy_v4i8: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x2: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x3: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x4: +; SI: buffer_load_dword [[REG:v[0-9]+]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: buffer_store_dword [[REG]] +; SI: s_endpgm +define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI-DAG: v_add 
+; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI_DAG: buffer_store_byte + +; After scalarizing v4i8 loads is fixed. +; XSI: buffer_load_dword +; XSI: V_BFE +; XSI: V_ADD +; XSI: V_ADD +; XSI: V_ADD +; XSI: buffer_store_dword +; XSI: buffer_store_dword + +; SI: s_endpgm +define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: v_add +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI_DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_store_byte +; SI_DAG: buffer_store_byte + +; XSI: buffer_load_dword +; XSI: BFE +; XSI: buffer_store_dword +; XSI: V_ADD +; XSI: buffer_store_dword +; XSI-NEXT: buffer_store_dword + +; SI: s_endpgm +define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %add = add <4 x i8> %val, + store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 + store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v3i8: +; SI-NOT: bfe +; SI-NOT: bfi +; SI: s_endpgm +define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: s_endpgm +define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { + %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll new file mode 100644 index 00000000000..fc875f6ef7a --- /dev/null +++ b/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca 
-verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s + +; Test that CopyToReg instructions don't have non-register operands prior +; to being emitted. + +; Make sure this doesn't crash +; CHECK-LABEL: {{^}}copy_to_reg_frameindex: +define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %alloca = alloca [16 x i32] + br label %loop + +loop: + %inc = phi i32 [0, %entry], [%inc.i, %loop] + %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc + store i32 %inc, i32* %ptr + %inc.i = add i32 %inc, 1 + %cnd = icmp uge i32 %inc.i, 16 + br i1 %cnd, label %done, label %loop + +done: + %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 + %tmp1 = load i32, i32* %tmp0 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll new file mode 100644 index 00000000000..bd26c302fe5 --- /dev/null +++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone +declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: +; SI: buffer_load_dwordx2 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] +define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone + store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: +; SI: buffer_load_dwordx4 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: v_ffbh_u32_e32 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +; EG: FFBH_UINT {{\*? *}}[[RESULT]] +define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone + store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll new file mode 100644 index 00000000000..0a031c5e24d --- /dev/null +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -0,0 +1,300 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone + +; FUNC-LABEL: {{^}}s_ctpop_i32: +; GCN: s_load_dword [[SVAL:s[0-9]+]], +; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] +; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; GCN: buffer_store_dword [[VRESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + store i32 %ctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; XXX - Why 0 in register? 
+; FUNC-LABEL: {{^}}v_ctpop_i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + store i32 %ctpop, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: +; GCN: buffer_load_dword [[VAL1:v[0-9]+]], +; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { + %val0 = load i32, i32 addrspace(1)* %in0, align 4 + %val1 = load i32, i32 addrspace(1)* %in1, align 4 + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone + %add = add i32 %ctpop0, %ctpop1 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: +; GCN: buffer_load_dword [[VAL0:v[0-9]+]], +; GCN-NEXT: s_waitcnt +; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { + %val0 = load i32, i32 addrspace(1)* %in0, align 4 + %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone + %add = add i32 %ctpop0, %sval + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v2i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 + %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone + store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v4i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 + %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone + store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v8i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { + 
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 + %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone + store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_v16i32: +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32_e64 +; GCN: s_endpgm + +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +; EG: BCNT_INT +define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { + %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 + %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone + store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, 4 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 4, %ctpop + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f +; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone + %add = add i32 %ctpop, 99999 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], +; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BCNT_INT +define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ctpop = call i32 @llvm.ctpop.i32(i32 %val) 
nounwind readnone
+ %add = add i32 %ctpop, %const
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
+; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+
+; EG: BCNT_INT
+define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+ %val = load i32, i32 addrspace(1)* %in, align 4
+ %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
+ %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
+ %const = load i32, i32 addrspace(1)* %gep, align 4
+ %add = add i32 %const, %ctpop
+ store i32 %add, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed.
+
+; FUNC-LABEL: {{^}}ctpop_i32_in_br:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
+; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
+; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+; EG: BCNT_INT
+define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+entry:
+ %tmp0 = icmp eq i32 %cond, 0
+ br i1 %tmp0, label %if, label %else
+
+if:
+ %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
+ br label %endif
+
+else:
+ %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %tmp4 = load i32, i32 addrspace(1)* %tmp3
+ br label %endif
+
+endif:
+ %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
+ store i32 %tmp5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
new file mode 100644
index 00000000000..e1a0ee3ea21
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -0,0 +1,124 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
+declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
+declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
+
+; FUNC-LABEL: {{^}}s_ctpop_i64:
+; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: 
s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; GCN: buffer_store_dword [[VRESULT]],
+; GCN: s_endpgm
+define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_i64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+ %val = load i64, i64 addrspace(1)* %in, align 8
+ %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
+ %truncctpop = trunc i64 %ctpop to i32
+ store i32 %truncctpop, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_v2i64:
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
+define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}s_ctpop_v4i64:
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_bcnt1_i32_b64
+; GCN: s_endpgm
+define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v2i64:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
+define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
+ %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
+ %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
+ store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v4i64:
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: v_bcnt_u32_b32
+; GCN: s_endpgm
+define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
+ %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
+ %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
+ store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
+ ret void
+}
+
+; FIXME: We currently disallow SALU instructions in all branches,
+; but there are some cases when they should be allowed. 
+ +; FUNC-LABEL: {{^}}ctpop_i64_in_br: +; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd +; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 +; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} +; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] +; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] +; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} +; GCN: s_endpgm +define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %else + +if: + %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg) + br label %endif + +else: + %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1 + %tmp4 = load i64, i64 addrspace(1)* %tmp3 + br label %endif + +endif: + %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else] + store i64 %tmp5, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll new file mode 100644 index 00000000000..56fcb51fe14 --- /dev/null +++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone + +; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone + store i32 %cttz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: +; SI: buffer_load_dwordx2 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? 
*}}[[RESULT]] +define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 + %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone + store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: +; SI: buffer_load_dwordx4 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: v_ffbl_b32_e32 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +; EG: FFBL_INT {{\*? *}}[[RESULT]] +define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { + %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 + %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone + store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll new file mode 100644 index 00000000000..3399d9da29e --- /dev/null +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -0,0 +1,196 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}load_i8_to_f32: +; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] +; SI: buffer_store_dword [[CONV]], +define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { + %load = load i8, i8 addrspace(1)* %in, align 1 + %cvt = uitofp i8 %load to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}load_v2i8_to_v2f32: +; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], +; SI-NOT: bfe +; SI-NOT: lshr +; SI-NOT: and +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 + %cvt = uitofp <2 x i8> %load to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v3i8_to_v3f32: +; SI-NOT: bfe +; SI-NOT: v_cvt_f32_ubyte3_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <3 x i8> %load to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v4i8_to_v4f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte0_e32 
v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; This should not be adding instructions to shift into the correct +; position in the word for the component. + +; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned: +; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]] +; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]] +; SI-NOT: v_lshlrev_b32 +; SI-NOT: v_or_b32 + +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]] + +; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, +define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; XXX - This should really still be able to use the v_cvt_f32_ubyte0 +; for each component, but computeKnownBits doesn't handle vectors very +; well. + +; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 +; SI: v_cvt_f32_ubyte0_e32 + +; XXX - replace with this when v4i8 loads aren't scalarized anymore. +; XSI: buffer_load_dword +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; XSI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 + %cvt = uitofp <4 x i8> %load to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + %add = add <4 x i8> %load, ; Second use of %load + store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4 + ret void +} + +; Make sure this doesn't crash. 
+; SI-LABEL: {{^}}load_v7i8_to_v7f32: +; SI: s_endpgm +define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 + %cvt = uitofp <7 x i8> %load to <7 x float> + store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}load_v8i8_to_v8f32: +; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, +; SI-NOT: bfe +; SI-NOT: lshr +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] +; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] +; SI-NOT: bfe +; SI-NOT: lshr +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 + %cvt = uitofp <8 x i8> %load to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: +; SI: buffer_load_dword [[LOADREG:v[0-9]+]], +; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] +; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] +; SI: buffer_store_dword [[CONV]], +define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 2 + %inreg = and i32 %add, 255 + %cvt = uitofp i32 %inreg to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: +define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %inreg = and i32 %load, 65280 + %shr = lshr i32 %inreg, 8 + %cvt = uitofp i32 %shr to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; We don't get these ones because of the zext, but instcombine removes +; them so it shouldn't really matter. 
+define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { + %load = load i8, i8 addrspace(1)* %in, align 1 + %ext = zext i8 %load to i32 + %cvt = uitofp i32 %ext to float + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 + %ext = zext <4 x i8> %load to <4 x i32> + %cvt = uitofp <4 x i32> %ext to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll new file mode 100644 index 00000000000..2dd3a9f2a77 --- /dev/null +++ b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0: +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NOT: add +; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %floor = call float @llvm.floor.f32(float %x) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1: +; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}} +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 1.0 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| +; SI: s_endpgm +define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %floor = call float @llvm.floor.f32(float %x.fabs) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fneg = fsub float -0.000000e+00, %x + %floor = call float @llvm.floor.f32(float %x.fneg) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg: +; SI-NOT: add +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| +; SI: s_endpgm +define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs + %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* 
%out + ret void +} + +; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0: +; SI-NOT: v_cvt_flr_i32_f32 +; SI: v_floor_f32 +; SI: v_cvt_u32_f32_e32 +; SI: s_endpgm +define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { + %floor = call float @llvm.floor.f32(float %x) #1 + %cvt = fptoui float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll new file mode 100644 index 00000000000..864ac40260b --- /dev/null +++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) #1 +declare float @llvm.floor.f32(float) #1 + +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI: s_endpgm +define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %fadd = fadd float %x.fabs, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FIXME: This doesn't work because it forms fsub 0.5, x +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg: +; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} +; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}} +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fneg = fsub float -0.000000e+00, %x + %fadd = fadd float %x.fneg, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FIXME: This doesn't work for same reason as above +; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg: +; SI-SAFE-NOT: v_cvt_rpi_i32_f32 +; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| + +; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}| +; SI-SAFE-NOT: v_cvt_flr_i32_f32 +; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] +; SI: s_endpgm +define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { + %x.fabs = call float @llvm.fabs.f32(float %x) #1 + %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs + %fadd = fadd float %x.fabs.fneg, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptosi float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0: +; SI-NOT: v_cvt_rpi_i32_f32 +; SI: v_add_f32 +; SI: v_floor_f32 +; SI: v_cvt_u32_f32 +; SI: s_endpgm +define void @no_cvt_rpi_i32_f32_0(i32 
addrspace(1)* %out, float %x) #0 { + %fadd = fadd float %x, 0.5 + %floor = call float @llvm.floor.f32(float %fadd) #1 + %cvt = fptoui float %floor to i32 + store i32 %cvt, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll new file mode 100644 index 00000000000..fb43ff4fbdd --- /dev/null +++ b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -0,0 +1,36 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test is for a bug in +; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where +; the wrong type was being passed to +; TargetLowering::getOperationAction() when checking the legality of +; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. + + +; CHECK: {{^}}sint: +; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %sint = load i32, i32 addrspace(1) * %in + %conv = sitofp i32 %sint to float + %0 = insertelement <4 x float> undef, float %conv, i32 0 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} + +;CHECK: {{^}}uint: +;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %uint = load i32, i32 addrspace(1) * %in + %conv = uitofp i32 %uint to float + %0 = insertelement <4 x float> undef, float %conv, i32 0 + %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %splat, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/debug.ll b/test/CodeGen/AMDGPU/debug.ll new file mode 100644 index 00000000000..a2e0e878b74 --- /dev/null +++ b/test/CodeGen/AMDGPU/debug.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; Test for a crash in the custom assembly dump code. 
+ +; SI: s_endpgm +define void @test(i32 addrspace(1)* %out) { + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll new file mode 100644 index 00000000000..da8e91454b9 --- /dev/null +++ b/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_kernel: + +; DEFAULT: FloatMode: 192 +; DEFAULT: IeeeMode: 0 + +; FP64-DENORMAL: FloatMode: 192 +; FP64-DENORMAL: IeeeMode: 0 + +; FP32-DENORMAL: FloatMode: 48 +; FP32-DENORMAL: IeeeMode: 0 + +; BOTH-DENORMAL: FloatMode: 240 +; BOTH-DENORMAL: IeeeMode: 0 + +; NO-DENORMAL: FloatMode: 0 +; NO-DENORMAL: IeeeMode: 0 +define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} diff --git a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll new file mode 100644 index 00000000000..cdd2c0cd4f4 --- /dev/null +++ b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; PRED_SET* instructions must be tied to any instruction that uses their +; result. This tests that there are no instructions between the PRED_SET* +; and the PREDICATE_BREAK in this loop. 
+ +; CHECK: {{^}}loop_ge: +; CHECK: LOOP_START_DX10 +; CHECK: ALU_PUSH_BEFORE +; CHECK-NEXT: JUMP +; CHECK-NEXT: LOOP_BREAK +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/test/CodeGen/AMDGPU/dot4-folding.ll b/test/CodeGen/AMDGPU/dot4-folding.ll new file mode 100644 index 00000000000..4df7b63bf98 --- /dev/null +++ b/test/CodeGen/AMDGPU/dot4-folding.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Exactly one constant vector can be folded into dot4, which means exactly +; 4 MOV instructions +; CHECK: {{^}}main: +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV +; CHECK-NOT: MOV + +define void @main(float addrspace(1)* %out) { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) + %3 = insertelement <4 x float> undef, float %2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll new file mode 100644 index 00000000000..e7e13d6178c --- /dev/null +++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() #1 + +; Function Attrs: nounwind +; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop: +; CHECK: BB0_1: +; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] +; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] +; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] +; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] +; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] +; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] + +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 +; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 +; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 +; CHECK: s_endpgm +define void 
@signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { +entry: + %x.i = tail call i32 @llvm.r600.read.tidig.x() #0 + %mul = shl nsw i32 %x.i, 1 + br label %for.body + +for.body: ; preds = %for.body, %entry + %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ] + %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ] + %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @llvm.AMDGPU.barrier.local() #1 + %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 + %tmp = load float, float addrspace(3)* %arrayidx, align 4 + %add1 = add nsw i32 %offset.02, 1 + %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 + %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 + %add3 = add nsw i32 %offset.02, 32 + %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 + %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 + %add5 = add nsw i32 %offset.02, 33 + %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 + %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 + %add7 = add nsw i32 %offset.02, 64 + %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7 + %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4 + %add9 = fadd float %tmp, %tmp1 + %add10 = fadd float %add9, %tmp2 + %add11 = fadd float %add10, %tmp3 + %add12 = fadd float %add11, %tmp4 + %add13 = fadd float %sum.03, %add12 + %inc = add nsw i32 %k.01, 1 + %add14 = add nsw i32 %offset.02, 97 + %exitcond = icmp eq i32 %inc, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %tmp5 = sext i32 %x.i to i64 + %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5 + store float %add13, float addrspace(1)* %arrayidx15, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll new file mode 100644 index 00000000000..5929898f8bd --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -0,0 +1,515 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +; FIXME: We don't get cases where the address was an SGPR because we +; get a copy to the address register for each one. 
+ +@lds = addrspace(3) global [512 x float] undef, align 4 + @lds.f64 = addrspace(3) global [512 x double] undef, align 8 + +; SI-LABEL: @simple_read2_f32 +; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f32(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_max_offset +; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_too_far +; SI-NOT ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; SI: s_endpgm +define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_x2 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load 
float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure there is an instruction between the two sets of reads. +; SI-LABEL: @simple_read2_f32_x2_barrier +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 +; SI: s_barrier +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + call void @llvm.AMDGPU.barrier.local() #2 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; For some reason adding something to the base address for the first +; element results in only folding the inner pair. 
+ +; SI-LABEL: @simple_read2_f32_x2_nonzero_base +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum.0 = fadd float %val0, %val1 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + %val2 = load float, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + %val3 = load float, float addrspace(3)* %arrayidx3, align 4 + %sum.1 = fadd float %val2, %val3 + + %sum = fadd float %sum.0, %sum.1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Be careful of vectors of pointers. We don't know if the 2 pointers +; in the vectors are really the same base, so this is not safe to +; merge. +; Base pointers come from different subregister of same super +; register. We can't safely merge this. + +; SI-LABEL: @read2_ptr_is_subreg_arg_f32 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Apply a constant scalar offset after the pointer vector extract. We +; are rejecting merges that have the same, constant 0 offset, so make +; sure we are really rejecting it because of the different +; subregisters. 
+ +; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32 +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + + ; Apply an additional offset after the vector that will be more obviously folded. + %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 + + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1.offset, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; We should be able to merge in this case, but probably not worth the effort. +; SI-NOT: ds_read2_b32 +; SI: ds_read_b32 +; SI: ds_read_b32 +; SI: s_endpgm +define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 + %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 + %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1 + %idx = add <2 x i32> %x.i.v.1, + %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> , <2 x i32> %idx + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + %val0 = load float, float addrspace(3)* %gep.0, align 4 + %val1 = load float, float addrspace(3)* %gep.1, align 4 + %add.x = add nsw i32 %x.i, 8 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_volatile_0 +; SI-NOT ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f32_volatile_1 +; SI-NOT ds_read2_b32 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Can't fold since not correctly aligned. +; XXX: This isn't really testing anything useful now. I think CI +; allows unaligned LDS accesses, which would be a problem here. +; SI-LABEL: @unaligned_read2_f32 +; SI-NOT: ds_read2_b32 +; SI: s_endpgm +define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 1 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 1 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @misaligned_2_simple_read2_f32 +; SI-NOT: ds_read2_b32 +; SI: s_endpgm +define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 2 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 2 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2_f64 +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2_f64(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2_f64_max_offset +; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 +; SI: s_endpgm +define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 
0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2_f64_too_far +; SI-NOT ds_read2_b64 +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 +; SI: s_endpgm +define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; Alignment only 4 +; SI-LABEL: @misaligned_read2_f64 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 +; SI: s_endpgm +define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 7 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 4 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +@foo = addrspace(3) global [4 x i32] undef, align 4 + +; SI-LABEL: @load_constant_adjacent_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { + %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 + %sum = add i32 %val0, %val1 + store i32 %sum, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @load_constant_disjoint_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { + %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 + %sum = add i32 %val0, %val1 + store i32 %sum, i32 addrspace(1)* %out, align 4 + ret void +} + +@bar = addrspace(3) global [4 x i64] undef, align 4 + +; SI-LABEL: @load_misaligned64_constant_offsets +; 
SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { + %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 + %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 + %sum = add i64 %val0, %val1 + store i64 %sum, i64 addrspace(1)* %out, align 8 + ret void +} + +@bar.large = addrspace(3) global [4096 x i64] undef, align 4 + +; SI-LABEL: @load_misaligned64_constant_large_offsets +; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 +; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; SI: s_endpgm +define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { + %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 + %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 + %sum = add i64 %val0, %val1 + store i64 %sum, i64 addrspace(1)* %out, align 8 + ret void +} + +@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 +@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 + +define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { + %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i + %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 + %add47 = add nsw i32 %x.i, 1 + %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 + %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 + %add51 = add nsw i32 %x.i, 16 + %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 + %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 + %add55 = add nsw i32 %x.i, 17 + %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 + %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 + %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i + %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 + %add63 = add nsw i32 %y.i, 1 + %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 + %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 + %add67 = add nsw i32 %y.i, 32 + %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 + %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 + %add71 = add nsw i32 %y.i, 33 + %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 + %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 + %add75 = add nsw i32 %y.i, 64 + %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 + %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 + %add79 = add nsw i32 %y.i, 65 + %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 + %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 + %sum.0 = fadd float %tmp16, %tmp17 + %sum.1 = fadd float %sum.0, %tmp18 + %sum.2 = fadd float %sum.1, %tmp19 + %sum.3 = fadd float %sum.2, %tmp20 + %sum.4 = fadd float %sum.3, %tmp21 + %sum.5 = fadd float %sum.4, %tmp22 + %sum.6 = fadd float %sum.5, %tmp23 + %sum.7 = fadd float %sum.6, %tmp24 + %sum.8 = fadd float %sum.7, %tmp25 + store float %sum.8, float addrspace(1)* %C, align 4 + ret void +} + +define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { + %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { + %load = load i64, i64 addrspace(3)* %in, align 4 + store i64 %load, i64 addrspace(1)* %out, align 8 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll new file mode 100644 index 00000000000..9ea9a5a2617 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +; XFAIL: * + +@lds = addrspace(3) global [512 x float] undef, align 4 + +; SI-LABEL: {{^}}offset_order: + +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 +; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 + +define void @offset_order(float addrspace(1)* %out) { +entry: + %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 + %val0 = load float, float addrspace(3)* %ptr0 + + %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256 + %val1 = load float, float addrspace(3)* %ptr1 + %add1 = fadd float %val0, %val1 + + %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3 + %val2 = load float, float addrspace(3)* %ptr2 + %add2 = fadd float %add1, %val2 + + %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 + %val3 = load float, float addrspace(3)* %ptr3 + %add3 = fadd float %add2, %val3 + + %ptr4 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 + %val4 = load float, float addrspace(3)* %ptr4 + %add4 = fadd float %add3, %val4 + + %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14 + %val5 = load float, float addrspace(3)* %ptr5 + %add5 = fadd float %add4, %val5 + + %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 + %val6 = load float, float addrspace(3)* %ptr6 + %add6 = fadd float %add5, %val6 + store float %add6, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll new file mode 100644 index 00000000000..54b3b45636d --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -0,0 +1,272 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + + +; SI-LABEL: @simple_read2st64_f32_0_1 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_1_2 +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 128 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_max_offset +; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, 
float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 16320 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f32_over_max_offset +; SI-NOT: ds_read2st64_b32 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 +; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] +; SI: s_endpgm +define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 16384 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @odd_invalid_read2st64_f32_0 +; SI-NOT: ds_read2st64_b32 +; SI: s_endpgm +define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 63 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @odd_invalid_read2st64_f32_1 +; SI-NOT: ds_read2st64_b32 +; SI: s_endpgm +define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 127 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %sum, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_0_1 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 
%x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @simple_read2st64_f64_1_2 +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; Alignment only + +; SI-LABEL: @misaligned_read2st64_f64 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 +; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 +; SI: s_endpgm +define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 4 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff +; SI-LABEL: @simple_read2st64_f64_max_offset +; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 +; SI: s_waitcnt lgkmcnt(0) +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 256 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; 
SI-LABEL: @simple_read2st64_f64_over_max_offset +; SI-NOT: ds_read2st64_b64 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 +; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] +; SI: s_endpgm +define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8192 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; SI-LABEL: @invalid_read2st64_f64_odd_offset +; SI-NOT: ds_read2st64_b64 +; SI: s_endpgm +define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.x.0 = add nsw i32 %x.i, 64 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8129 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 8 + ret void +} + +; The stride of 8 elements is 8 * 8 bytes. We need to make sure the +; stride in elements, not bytes, is a multiple of 64. 
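(To spell out the arithmetic behind the two comments above, as I read them: a ds_read2st64_b64 offset unit is 64 elements of 8 bytes, i.e. 512 bytes.

    0xff * 64 * 8 = 130560 bytes  >  0xffff (65535), so offset1:255 does not encode
    127  * 64 * 8 =  65024 bytes <=  0xffff,         so offset1:127 is the usable maximum

That is why the max-offset test above expects offset0:4 offset1:127. In the test that follows, the two f64 loads are only 8 elements apart; that is 64 bytes, but st64 needs the distance in elements, not bytes, to be a multiple of 64, so the pair falls back to a plain ds_read2_b64 with offset1:8.)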
+ +; SI-LABEL: @byte_size_only_divisible_64_read2_f64 +; SI-NOT: ds_read2st_b64 +; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 +; SI: s_endpgm +define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + %val0 = load double, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + %val1 = load double, double addrspace(3)* %arrayidx1, align 8 + %sum = fadd double %val0, %val1 + %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i + store double %sum, double addrspace(1)* %out.gep, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll new file mode 100644 index 00000000000..b553d3459e4 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -0,0 +1,425 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 + + +; SI-LABEL: @simple_write2_one_val_f32 +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i + %val = load float, float addrspace(1)* %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + 
%in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_volatile_0 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store volatile float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_volatile_1 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} +; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 +; SI: s_endpgm +define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; 2 data subregisters from different super registers. 
+; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 +; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 + %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 + %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 + %val0.0 = extractelement <2 x float> %val0, i32 0 + %val1.1 = extractelement <2 x float> %val1, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0.0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1.1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg2_f32 +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i + %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 + %val0 = extractelement <2 x float> %val, i32 0 + %val1 = extractelement <2 x float> %val, i32 1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_subreg4_f32 +; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i + %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 + %val0 = extractelement <4 x float> %val, i32 0 + %val1 = extractelement <4 x float> %val, i32 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_max_offset_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: 
buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; SI: s_endpgm +define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 255 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_too_far_f32 +; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} +; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 +; SI: s_endpgm +define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 257 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f32_x2 +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 0 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: 
@simple_write2_two_val_f32_x2_nonzero_base +; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 +; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 +; SI: s_endpgm +define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %idx.0 = add nsw i32 %tid.x, 3 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + + %idx.1 = add nsw i32 %tid.x, 8 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + + %idx.2 = add nsw i32 %tid.x, 11 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 + store float %val0, float addrspace(3)* %arrayidx2, align 4 + + %idx.3 = add nsw i32 %tid.x, 27 + %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 + store float %val1, float addrspace(3)* %arrayidx3, align 4 + + ret void +} + +; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 +; SI-NOT: ds_write2_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: s_endpgm +define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i + %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i + %val0 = load float, float addrspace(1)* %in0.gep, align 4 + %val1 = load float, float addrspace(1)* %in1.gep, align 4 + + %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 + %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 + %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 + %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 + %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 + + ; Apply an additional offset after the vector that will be more obviously folded. 
+ %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 + store float %val0, float addrspace(3)* %gep.0, align 4 + + %add.x = add nsw i32 %x.i, 8 + store float %val1, float addrspace(3)* %gep.1.offset, align 4 + ret void +} + +; SI-LABEL: @simple_write2_one_val_f64 +; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 +; SI: s_endpgm +define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; SI-LABEL: @misaligned_simple_write2_one_val_f64 +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 +; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 +; SI: s_endpgm +define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 7 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2_two_val_f64 +; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: s_endpgm +define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 + %val0 = load double, double addrspace(1)* %in.gep.0, align 8 + %val1 = load double, double addrspace(1)* %in.gep.1, align 8 + %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i + store double %val0, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x + store double %val1, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +@foo = addrspace(3) global [4 x i32] undef, align 4 + +; SI-LABEL: @store_constant_adjacent_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +define void @store_constant_adjacent_offsets() { + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 + ret void +} + +; SI-LABEL: @store_constant_disjoint_offsets +; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} +; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +define void @store_constant_disjoint_offsets() { + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 + store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 + ret void +} + +@bar = addrspace(3) global [4 x i64] undef, align 4 + +; SI-LABEL: @store_misaligned64_constant_offsets +; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +define void @store_misaligned64_constant_offsets() { + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 + ret void +} + +@bar.large = addrspace(3) global [4096 x i64] undef, align 4 + +; SI-LABEL: @store_misaligned64_constant_large_offsets +; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} +; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} +; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: s_endpgm +define void @store_misaligned64_constant_large_offsets() { + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 + store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 + ret void +} + +@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 +@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 + +define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %val = load float, float addrspace(1)* %in + %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx44, align 4 + %add47 = add nsw i32 %x.i, 1 + %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 + store float %val, float addrspace(3)* %arrayidx48, align 4 + %add51 = add nsw i32 %x.i, 16 + %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 + store float %val, float addrspace(3)* %arrayidx52, align 4 + %add55 = add nsw i32 %x.i, 17 + %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 + store float %val, float addrspace(3)* %arrayidx56, align 4 + %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i + store float %val, 
float addrspace(3)* %arrayidx60, align 4 + %add63 = add nsw i32 %y.i, 1 + %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 + store float %val, float addrspace(3)* %arrayidx64, align 4 + %add67 = add nsw i32 %y.i, 32 + %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 + store float %val, float addrspace(3)* %arrayidx68, align 4 + %add71 = add nsw i32 %y.i, 33 + %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 + store float %val, float addrspace(3)* %arrayidx72, align 4 + %add75 = add nsw i32 %y.i, 64 + %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 + store float %val, float addrspace(3)* %arrayidx76, align 4 + %add79 = add nsw i32 %y.i, 65 + %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 + store float %val, float addrspace(3)* %arrayidx80, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll new file mode 100644 index 00000000000..1d9d881c5c7 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -0,0 +1,119 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + + +@lds = addrspace(3) global [512 x float] undef, align 4 + + +; SI-LABEL: @simple_write2st64_one_val_f32_0_1 +; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 +; SI: s_endpgm +define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i + %val = load float, float addrspace(1)* %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + store float %val, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 64 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x + store float %val, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_f32_2_5 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 +; SI: s_endpgm +define void 
@simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %add.x.0 = add nsw i32 %x.i, 128 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x.1 = add nsw i32 %x.i, 320 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 +; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} +; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 +; SI: s_endpgm +define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %val0 = load float, float addrspace(1)* %in.gep.0, align 4 + %val1 = load float, float addrspace(1)* %in.gep.1, align 4 + %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i + store float %val0, float addrspace(3)* %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 16320 + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x + store float %val1, float addrspace(3)* %arrayidx1, align 4 + ret void +} + +; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 +; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], +; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 +; SI: s_endpgm +define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i + %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 + %val0 = load double, double addrspace(1)* %in.gep.0, align 8 + %val1 = load double, double addrspace(1)* %in.gep.1, align 8 + %add.x.0 = add nsw i32 %x.i, 256 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 + store double %val0, double addrspace(3)* %arrayidx0, align 8 + %add.x.1 = add nsw i32 %x.i, 8128 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 + store double %val1, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 +; SI-NOT: ds_write2st64_b64 +; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 +; SI: s_endpgm +define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i + %val = load double, double addrspace(1)* %in.gep, align 8 + %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i + store double %val, double addrspace(3)* %arrayidx0, align 8 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x + store double %val, double addrspace(3)* %arrayidx1, align 8 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll new file mode 100644 index 00000000000..d0fd06a3437 --- /dev/null +++ b/test/CodeGen/AMDGPU/elf.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s +; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s + +; Test that we don't try to produce a COFF file on windows +; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s + +; ELF: Format: ELF32 +; ELF: Name: .AMDGPU.config +; ELF: Type: SHT_PROGBITS + +; ELF: Symbol { +; ELF: Name: test +; ELF: Binding: Global + +; CONFIG: .section .AMDGPU.config +; CONFIG-NEXT: .long 45096 +; TYPICAL-NEXT: .long 0 +; TONGA-NEXT: .long 576 +; CONFIG: .align 256 +; CONFIG: test: +define void @test(i32 %p) #0 { + %i = add i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader diff --git a/test/CodeGen/AMDGPU/elf.r600.ll b/test/CodeGen/AMDGPU/elf.r600.ll new file mode 100644 index 00000000000..51cd0850093 --- /dev/null +++ b/test/CodeGen/AMDGPU/elf.r600.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF %s +; RUN: llc < 
%s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s + +; ELF: Format: ELF32 +; ELF: Name: .AMDGPU.config + +; CONFIG: .section .AMDGPU.config +; CONFIG-NEXT: .long 166100 +; CONFIG-NEXT: .long 2 +; CONFIG-NEXT: .long 165900 +; CONFIG-NEXT: .long 0 +define void @test(float addrspace(1)* %out, i32 %p) { + %i = add i32 %p, 2 + %r = bitcast i32 %i to float + store float %r, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/empty-function.ll b/test/CodeGen/AMDGPU/empty-function.ll new file mode 100644 index 00000000000..a060900811e --- /dev/null +++ b/test/CodeGen/AMDGPU/empty-function.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Make sure we don't assert on empty functions + +; SI: .text +; SI-LABEL: {{^}}empty_function_ret: +; SI: s_endpgm +; SI: codeLenInByte = 4 +define void @empty_function_ret() #0 { + ret void +} + +; SI: .text +; SI-LABEL: {{^}}empty_function_unreachable: +; SI: codeLenInByte = 0 +define void @empty_function_unreachable() #0 { + unreachable +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll new file mode 100644 index 00000000000..267a323c506 --- /dev/null +++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; This tests that the llvm.SI.end.cf intrinsic is not inserted into the +; loop block. This intrinsic will be lowered to s_or_b64 by the code +; generator. + +; CHECK-LABEL: {{^}}test: + +; This is was lowered from the llvm.SI.end.cf intrinsic: +; CHECK: s_or_b64 exec, exec + +; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} +; CHECK-NOT: s_or_b64 exec, exec +; CHECK: s_cbranch_execnz [[LOOP_LABEL]] +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %if, label %loop + +if: + store i32 0, i32 addrspace(1)* %out + br label %loop + +loop: + %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop] + %inc = add i32 %tmp1, %cond + %tmp2 = icmp ugt i32 %inc, 10 + br i1 %tmp2, label %done, label %loop + +done: + %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1 + store i32 %inc, i32 addrspace(1)* %tmp3 + ret void +} diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll new file mode 100644 index 00000000000..294c3a9c678 --- /dev/null +++ b/test/CodeGen/AMDGPU/extload-private.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}load_i8_sext_private: +; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i8_sext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i8 + %tmp1 = load i8, i8* %tmp0 + %tmp2 = sext i8 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_zext_private: +; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i8_zext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i8 + %tmp1 = load i8, i8* %tmp0 + %tmp2 = zext i8 %tmp1 to i32 + store i32 
%tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext_private: +; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i16_sext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i16 + %tmp1 = load i16, i16* %tmp0 + %tmp2 = sext i16 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_zext_private: +; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +define void @load_i16_zext_private(i32 addrspace(1)* %out) { +entry: + %tmp0 = alloca i16 + %tmp1 = load i16, i16* %tmp0 + %tmp2 = zext i16 %tmp1 to i32 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll new file mode 100644 index 00000000000..662eb7a9716 --- /dev/null +++ b/test/CodeGen/AMDGPU/extload.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}anyext_load_i8: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], +; EG: VTX_READ_32 [[VAL]] + +define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { + %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %cast, align 1 + %x = bitcast i32 %load to <4 x i8> + %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)* + store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_i16: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], +; EG: VTX_READ_32 [[VAL]] + +define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { + %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %cast, align 1 + %x = bitcast i32 %load to <2 x i16> + %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)* + store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_lds_i8: +; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] +; EG: LDS_WRITE * [[VAL]] +define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { + %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* + %load = load i32, i32 addrspace(3)* %cast, align 1 + %x = bitcast i32 %load to <4 x i8> + %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)* + store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1 + ret void +} + +; FUNC-LABEL: {{^}}anyext_load_lds_i16: +; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] +; EG: LDS_WRITE * [[VAL]] +define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { + %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* + %load = load i32, i32 addrspace(3)* %cast, align 1 + %x = bitcast i32 %load to <2 x i16> + %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)* + store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll 
new file mode 100644 index 00000000000..c7572efc6f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/extract_vector_elt_i16.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}extract_vector_elt_v2i16: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { + %p0 = extractelement <2 x i16> %foo, i32 0 + %p1 = extractelement <2 x i16> %foo, i32 1 + %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + store i16 %p1, i16 addrspace(1)* %out, align 2 + store i16 %p0, i16 addrspace(1)* %out1, align 2 + ret void +} + +; FUNC-LABEL: {{^}}extract_vector_elt_v4i16: +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_store_short +; SI: buffer_store_short +define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { + %p0 = extractelement <4 x i16> %foo, i32 0 + %p1 = extractelement <4 x i16> %foo, i32 2 + %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + store i16 %p1, i16 addrspace(1)* %out, align 2 + store i16 %p0, i16 addrspace(1)* %out1, align 2 + ret void +} diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll new file mode 100644 index 00000000000..3c6136c1a7b --- /dev/null +++ b/test/CodeGen/AMDGPU/fabs.f64.ll @@ -0,0 +1,97 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +declare double @fabs(double) readnone +declare double @llvm.fabs.f64(double) readnone +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone + +; FUNC-LABEL: {{^}}v_fabs_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tidext = sext i32 %tid to i64 + %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext + %val = load double, double addrspace(1)* %gep, align 8 + %fabs = call double @llvm.fabs.f64(double %val) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_f64: +; SI: v_and_b32 +; SI-NOT: v_and_b32 +; SI: s_endpgm +define void @fabs_f64(double addrspace(1)* %out, double %in) { + %fabs = call double @llvm.fabs.f64(double %in) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v2f64: +; SI: v_and_b32 +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { + %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) + store <2 x double> %fabs, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v4f64: +; SI: v_and_b32 +; SI: v_and_b32 +; SI: v_and_b32 +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { + %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) + store <4 x double> %fabs, <4 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fabs_fold_f64: +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NOT: and +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} +; SI: 
s_endpgm +define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { + %fabs = call double @llvm.fabs.f64(double %in0) + %fmul = fmul double %fabs, %in1 + store double %fmul, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fabs_fn_fold_f64: +; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NOT: and +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} +; SI: s_endpgm +define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { + %fabs = call double @fabs(double %in0) + %fmul = fmul double %fabs, %in1 + store double %fmul, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_free_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { + %bc= bitcast i64 %in to double + %fabs = call double @llvm.fabs.f64(double %bc) + store double %fabs, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_fn_free_f64: +; SI: v_and_b32 +; SI: s_endpgm +define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { + %bc= bitcast i64 %in to double + %fabs = call double @fabs(double %bc) + store double %fabs, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll new file mode 100644 index 00000000000..419a73d0266 --- /dev/null +++ b/test/CodeGen/AMDGPU/fabs.ll @@ -0,0 +1,101 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; DAGCombiner will transform: +; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) +; unless isFabsFree returns true + +; FUNC-LABEL: {{^}}fabs_fn_free: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| + +; GCN: v_and_b32 + +define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { + %bc= bitcast i32 %in to float + %fabs = call float @fabs(float %bc) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_free: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| + +; GCN: v_and_b32 + +define void @fabs_free(float addrspace(1)* %out, i32 %in) { + %bc= bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +define void @fabs_f32(float addrspace(1)* %out, float %in) { + %fabs = call float @llvm.fabs.f32(float %in) + store float %fabs, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v2f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +; GCN: v_and_b32 +define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + store <2 x float> %fabs, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fabs_v4f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: |{{(PV|T[0-9])\.[XYZW]}}| + +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +; GCN: v_and_b32 +define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + store <4 x float> 
%fabs, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fabs_fn_fold: +; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: and +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { + %fabs = call float @fabs(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fabs_fold: +; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: and +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} +define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { + %fabs = call float @llvm.fabs.f32(float %in0) + %fmul = fmul float %fabs, %in1 + store float %fmul, float addrspace(1)* %out + ret void +} + +declare float @fabs(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll new file mode 100644 index 00000000000..5fac328c598 --- /dev/null +++ b/test/CodeGen/AMDGPU/fadd.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +; FUNC-LABEL: {{^}}fadd_f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: v_add_f32 +define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { + %add = fadd float %a, %b + store float %add, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v2f32: +; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z +; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { + %add = fadd <2 x float> %a, %b + store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v4f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 + %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}fadd_v8f32: +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; R600: ADD +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { + %add = fadd <8 x float> %a, %b + store <8 x 
float> %add, <8 x float> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll new file mode 100644 index 00000000000..485c55870c4 --- /dev/null +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}fadd_f64: +; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} + +define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fadd double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll new file mode 100644 index 00000000000..f23e8919d73 --- /dev/null +++ b/test/CodeGen/AMDGPU/fceil.ll @@ -0,0 +1,132 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.ceil.f32(float) nounwind readnone +declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone + +; FUNC-LABEL: {{^}}fceil_f32: +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.ceil.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v2f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v3f32: +; FIXME-SI: v_ceil_f32_e32 +; FIXME-SI: v_ceil_f32_e32 +; FIXME-SI: v_ceil_f32_e32 +; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { + %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone + store <3 x float> %y, <3 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v4f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? *}}[[RESULT]] +; EG: CEIL {{\*? 
*}}[[RESULT]] +define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v8f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v16f32: +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; SI: v_ceil_f32_e32 +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}} +; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}} +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT1]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT2]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT3]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? *}}[[RESULT4]] +; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] +define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll new file mode 100644 index 00000000000..e8c34f0141e --- /dev/null +++ b/test/CodeGen/AMDGPU/fceil64.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.ceil.f64(double) nounwind readnone +declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}fceil_f64: +; CI: v_ceil_f64_e32 +; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 +; SI: s_lshr_b64 +; SI: s_not_b64 +; SI: s_and_b64 +; SI: cmp_gt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: cmp_lt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI-DAG: v_cmp_lt_f64 +; SI-DAG: v_cmp_lg_f64 +; SI: s_and_b64 +; SI: v_cndmask_b32 +; SI: v_cndmask_b32 +; SI: v_add_f64 +; SI: s_endpgm +define void @fceil_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.ceil.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v2f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}fceil_v3f64: +; FIXME-CI: v_ceil_f64_e32 +; FIXME-CI: v_ceil_f64_e32 +; FIXME-CI: v_ceil_f64_e32 +; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}fceil_v4f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v8f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fceil_v16f64: +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; 
CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +; CI: v_ceil_f64_e32 +define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp-cnd.ll b/test/CodeGen/AMDGPU/fcmp-cnd.ll new file mode 100644 index 00000000000..530274f920f --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp-cnd.ll @@ -0,0 +1,14 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Not checking arguments 2 and 3 to CNDE, because they may change between +;registers and literal.x depending on what the optimizer does. +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %cmp = fcmp oeq float %0, 0.000000e+00 + %value = select i1 %cmp, i32 2, i32 3 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll new file mode 100644 index 00000000000..c402805feb3 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the +; chance to optimize the fcmp + select instructions to SET* was missed +; due to the fact that the operands to fcmp and select had different types + +; CHECK: SET{{[A-Z]+}}_DX10 + +define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %cmp = fcmp oeq float %0, 0.000000e+00 + %value = select i1 %cmp, i32 -1, i32 0 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll new file mode 100644 index 00000000000..5207ab57bad --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}fcmp_sext: +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 + %1 = load float, float addrspace(1)* %arrayidx1 + %cmp = fcmp oeq float %0, %1 + %sext = sext i1 %cmp to i32 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +; This test checks that a setcc node with f32 operands is lowered to a +; SET*_DX10 instruction. 
Previously we were lowering this to: +; SET* + FP_TO_SINT + +; CHECK: {{^}}fcmp_br: +; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} +; CHECK-NEXT {{[0-9]+(5.0}} + +define void @fcmp_br(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %1 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll new file mode 100644 index 00000000000..053ab0ed7aa --- /dev/null +++ b/test/CodeGen/AMDGPU/fcmp64.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}flt_f64: +; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ult double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fle_f64: +; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ule double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fgt_f64: +; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ugt double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fge_f64: +; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp uge double %r0, %r1 + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fne_f64: +; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp une double %r0, %r1 + %r3 = select i1 %r2, double %r0, double %r1 + store double %r3, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}feq_f64: +; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} +define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fcmp ueq double %r0, %r1 + %r3 = select i1 %r2, double %r0, double %r1 + store double %r3, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll new file mode 100644 index 00000000000..89af37545c9 --- /dev/null +++ 
b/test/CodeGen/AMDGPU/fconst64.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}fconst_f64: +; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 +; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 + +define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %r1 = load double, double addrspace(1)* %in + %r2 = fadd double %r1, 5.000000e+00 + store double %r2, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll new file mode 100644 index 00000000000..b719d5a3978 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.copysign.f32(float, float) nounwind readnone +declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone + +; Try to identify arg based on higher address. +; FUNC-LABEL: {{^}}test_copysign_f32: +; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb +; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc +; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c +; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 +; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] +; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] +; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm + +; EG: BFI_INT +define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { + %result = call float @llvm.copysign.f32(float %mag, float %sign) + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v2f32: +; GCN: s_endpgm + +; EG: BFI_INT +; EG: BFI_INT +define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { + %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v4f32: +; GCN: s_endpgm + +; EG: BFI_INT +; EG: BFI_INT +; EG: BFI_INT +; EG: BFI_INT +define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { + %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll new file mode 100644 index 00000000000..3d8c5599308 --- /dev/null +++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s + +declare double @llvm.copysign.f64(double, double) nounwind readnone 
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}test_copysign_f64: +; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] +; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] +; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff +; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] +; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} +; GCN: s_endpgm +define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { + %result = call double @llvm.copysign.f64(double %mag, double %sign) + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v2f64: +; GCN: s_endpgm +define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { + %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) + store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}test_copysign_v4f64: +; GCN: s_endpgm +define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { + %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) + store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll new file mode 100644 index 00000000000..7c022e38c80 --- /dev/null +++ b/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -0,0 +1,96 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s + + +; COMMON-LABEL: {{^}}fdiv_f64: +; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 +; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] + +; Check for div_scale bug workaround on SI +; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] + +; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] + +; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} +; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc + +; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], 
-[[SCALE0]], [[RCP_SCALE0]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] +; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 +; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] +; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] +; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] +; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] +; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] +; COMMON: buffer_store_dwordx2 [[RESULT]] +; COMMON: s_endpgm +define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { + %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 + %num = load double, double addrspace(1)* %in + %den = load double, double addrspace(1)* %gep.1 + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_v: +define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { + %den = load double, double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_v_s: +define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { + %num = load double, double addrspace(1)* %in + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}fdiv_f64_s_s: +define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { + %result = fdiv double %num, %den + store double %result, double addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v2f64: +define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 + %num = load <2 x double>, <2 x double> addrspace(1)* %in + %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v2f64: +define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { + %result = fdiv <2 x double> %num, %den + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}v_fdiv_v4f64: +define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { + %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 + %num = load <4 x double>, <4 x double> addrspace(1)* %in + %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; COMMON-LABEL: {{^}}s_fdiv_v4f64: +define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { + %result = fdiv <4 x double> %num, %den + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll new file mode 100644 index 00000000000..7cbf8733639 --- /dev/null +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -0,0 +1,68 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s +; RUN: llc -march=amdgcn 
-mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; These tests check that fdiv is expanded correctly and also test that the +; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate +; instruction groups. + +; FUNC-LABEL: {{^}}fdiv_f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fdiv float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + + + +; FUNC-LABEL: {{^}}fdiv_v2f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fdiv <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fdiv_v4f32: +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS +; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS + +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_mul_f32 +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fdiv <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fetch-limits.r600.ll b/test/CodeGen/AMDGPU/fetch-limits.r600.ll new file mode 100644 index 00000000000..e7160ef5d72 --- /dev/null +++ b/test/CodeGen/AMDGPU/fetch-limits.r600.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s + +; R600 supports 8 fetches in a clause +; CHECK: {{^}}fetch_limits_r600: +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r600() #0 { +entry: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* 
null, i64 0, i32 3) + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + + %bcde = fadd <4 x float> %bc, %de + + call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/fetch-limits.r700+.ll b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll new file mode 100644 index 00000000000..acaea2aa794 --- /dev/null +++ b/test/CodeGen/AMDGPU/fetch-limits.r700+.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; r700+ supports 16 fetches in a clause +; CHECK: {{^}}fetch_limits_r700: +; CHECK: Fetch clause +; CHECK: Fetch clause + +define void @fetch_limits_r700() #0 { +entry: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x 
float>] addrspace(8)* null, i64 0, i32 3) + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) + %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) + %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) + %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) + %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) + %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1) + %a = fadd <4 x float> %res0, %res1 + %b = fadd <4 x float> %res2, %res3 + %c = fadd <4 x float> %res4, %res5 + %d = fadd <4 x float> %res6, %res7 + %e = fadd <4 x float> %res8, %res9 + %f = fadd <4 x float> %res10, %res11 + %g = fadd <4 x float> 
%res12, %res13 + %h = fadd <4 x float> %res14, %res15 + %i = fadd <4 x float> %res16, %a + + %bc = fadd <4 x float> %b, %c + %de = fadd <4 x float> %d, %e + %fg = fadd <4 x float> %f, %g + %hi = fadd <4 x float> %h, %i + + %bcde = fadd <4 x float> %bc, %de + %fghi = fadd <4 x float> %fg, %hi + + %bcdefghi = fadd <4 x float> %bcde, %fghi + call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) + ret void +} + +attributes #0 = { "ShaderType"="0" } ; Pixel Shader + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll new file mode 100644 index 00000000000..45f8382c392 --- /dev/null +++ b/test/CodeGen/AMDGPU/ffloor.f64.ll @@ -0,0 +1,127 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.fabs.f64(double %Val) +declare double @llvm.floor.f64(double) nounwind readnone +declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}ffloor_f64: +; CI: v_floor_f64_e32 +; SI: v_fract_f64_e32 +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 +; SI: s_endpgm +define void @ffloor_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.floor.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_f64_neg: +; CI: v_floor_f64_e64 +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]] +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] +; SI: s_endpgm +define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { + %neg = fsub double 0.0, %x + %y = call double @llvm.floor.f64(double %neg) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_f64_neg_abs: +; CI: v_floor_f64_e64 +; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]| +; SI: v_min_f64 +; SI: v_cmp_class_f64_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| +; SI: s_endpgm +define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { + %abs = call double @llvm.fabs.f64(double %x) + %neg = fsub double 0.0, %abs + %y = call double @llvm.floor.f64(double %neg) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v2f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64: 
+; FIXME-CI: v_floor_f64_e32 +; FIXME-CI: v_floor_f64_e32 +; FIXME-CI: v_floor_f64_e32 +; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ffloor_v4f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v8f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ffloor_v16f64: +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +; CI: v_floor_f64_e32 +define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll new file mode 100644 index 00000000000..61c46ac2bc0 --- /dev/null +++ b/test/CodeGen/AMDGPU/ffloor.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}floor_f32: +; SI: v_floor_f32_e32 +; R600: FLOOR +define void @floor_f32(float addrspace(1)* %out, float %in) { + %tmp = call float @llvm.floor.f32(float %in) #0 + store float %tmp, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}floor_v2f32: +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 + +define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 + store <2 x float> %tmp, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}floor_v4f32: +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 +; SI: v_floor_f32_e32 + +; R600: FLOOR +; R600: FLOOR +; R600: FLOOR +; R600: FLOOR +define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 + store <4 x float> %tmp, <4 x float> addrspace(1)* %out + ret void +} + +; Function Attrs: nounwind readonly +declare float @llvm.floor.f32(float) #0 + +; Function Attrs: nounwind readonly +declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0 + +; Function Attrs: nounwind readonly +declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0 + +attributes #0 = { nounwind readnone } diff --git 
a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll new file mode 100644 index 00000000000..8ceca078f2d --- /dev/null +++ b/test/CodeGen/AMDGPU/flat-address-space.ll @@ -0,0 +1,184 @@ +; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. + + +; CHECK-LABEL: {{^}}branch_use_flat_i32: +; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +; CHECK: s_endpgm +define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +entry: + %cmp = icmp ne i32 %c, 0 + br i1 %cmp, label %local, label %global + +local: + %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* + br label %end + +global: + %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + br label %end + +end: + %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] + store i32 %x, i32 addrspace(4)* %fptr, align 4 +; %val = load i32, i32 addrspace(4)* %fptr, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; These testcases might become useless when there are optimizations to +; remove generic pointers. 
+ +; CHECK-LABEL: {{^}}store_flat_i32: +; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} +; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_i64: +; CHECK: flat_store_dwordx2 +define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* + store i64 %x, i64 addrspace(4)* %fptr, align 8 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_v4i32: +; CHECK: flat_store_dwordx4 +define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* + store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_trunc_i16: +; CHECK: flat_store_short +define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %y = trunc i32 %x to i16 + store i16 %y, i16 addrspace(4)* %fptr, align 2 + ret void +} + +; CHECK-LABEL: {{^}}store_flat_trunc_i8: +; CHECK: flat_store_byte +define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %y = trunc i32 %x to i8 + store i8 %y, i8 addrspace(4)* %fptr, align 2 + ret void +} + + + +; CHECK-LABEL @load_flat_i32: +; CHECK: flat_load_dword +define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* + %fload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %fload, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @load_flat_i64: +; CHECK: flat_load_dwordx2 +define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* + %fload = load i64, i64 addrspace(4)* %fptr, align 4 + store i64 %fload, i64 addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL @load_flat_v4i32: +; CHECK: flat_load_dwordx4 +define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* + %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4 + store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL @sextload_flat_i8: +; CHECK: flat_load_sbyte +define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fload = load i8, i8 addrspace(4)* %fptr, align 4 + %ext = sext i8 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @zextload_flat_i8: +; CHECK: flat_load_ubyte +define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* + %fload = load i8, i8 addrspace(4)* %fptr, align 4 + %ext = zext i8 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @sextload_flat_i16: +; CHECK: 
flat_load_sshort +define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fload = load i16, i16 addrspace(4)* %fptr, align 4 + %ext = sext i16 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL @zextload_flat_i16: +; CHECK: flat_load_ushort +define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { + %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* + %fload = load i16, i16 addrspace(4)* %fptr, align 4 + %ext = zext i16 %fload to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + + + +; TODO: This should not be zero when registers are used for small +; scratch allocations again. + +; Check for prologue initializing special SGPRs pointing to scratch. +; CHECK-LABEL: {{^}}store_flat_scratch: +; CHECK: s_movk_i32 flat_scratch_lo, 0 +; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} +; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} +; CHECK: flat_store_dword +; CHECK: s_barrier +; CHECK: flat_load_dword +define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { + %alloca = alloca i32, i32 9, align 4 + %x = call i32 @llvm.r600.read.tidig.x() #3 + %pptr = getelementptr i32, i32* %alloca, i32 %x + %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* + store i32 %x, i32 addrspace(4)* %fptr + ; Dummy call + call void @llvm.AMDGPU.barrier.local() #1 + %reload = load i32, i32 addrspace(4)* %fptr, align 4 + store i32 %reload, i32 addrspace(1)* %out, align 4 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 +declare i32 @llvm.r600.read.tidig.x() #3 + +attributes #0 = { nounwind } +attributes #1 = { nounwind noduplicate } +attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/floor.ll b/test/CodeGen/AMDGPU/floor.ll new file mode 100644 index 00000000000..c6bfb8567a0 --- /dev/null +++ b/test/CodeGen/AMDGPU/floor.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s + +; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = call float @floor(float %r0) + %vec = insertelement <4 x float> undef, float %r1, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @floor(float) readonly +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll new file mode 100644 index 00000000000..bd574b87711 --- /dev/null +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -0,0 +1,368 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare double @llvm.fabs.f64(double) #0 +declare double @llvm.fma.f64(double, double, double) #0 +declare float @llvm.fma.f32(float, float, float) #0 + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_fma_f64_0: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 
[[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fadd double %mul, %c + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %fma0 = fadd double %mul, %c + %fma1 = fadd double %mul, %d + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_fma_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] +; SI: 
buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fadd double %c, %mul + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fsub double %mul, %c + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + 
%gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %fma0 = fsub double %mul, %c + %fma1 = fsub double %mul, %d + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %fma = fsub double %c, %mul + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* 
%gep.3 + + %mul = fmul double %a, %b + %fma0 = fsub double %c, %mul + %fma1 = fsub double %d, %mul + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + + %mul = fmul double %a, %b + %mul.neg = fsub double -0.0, %mul + %fma = fsub double %mul.neg, %c + + store double %fma, double addrspace(1)* %gep.out + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %mul.neg = fsub double -0.0, %mul + %fma0 = fsub double %mul.neg, %c + %fma1 = fsub double %mul.neg, %d + + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma 
(fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] +; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] +; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI: s_endpgm +define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0 + %b = load double, double addrspace(1)* %gep.1 + %c = load double, double addrspace(1)* %gep.2 + %d = load double, double addrspace(1)* %gep.3 + + %mul = fmul double %a, %b + %mul.neg = fsub double -0.0, %mul + %fma0 = fsub double %mul.neg, %c + %fma1 = fsub double %mul, %d + + store double %fma0, double addrspace(1)* %gep.out.0 + store double %fma1, double addrspace(1)* %gep.out.1 + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64: +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]] +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %x = load double, double addrspace(1)* %gep.0 + %y = load double, double addrspace(1)* %gep.1 + %z = load double, double 
addrspace(1)* %gep.2 + %u = load double, double addrspace(1)* %gep.3 + %v = load double, double addrspace(1)* %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 + %tmp2 = fsub double %tmp1, %z + + store double %tmp2, double addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: +; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} +; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} +; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} +; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]] +; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid + + %x = load double, double addrspace(1)* %gep.0 + %y = load double, double addrspace(1)* %gep.1 + %z = load double, double addrspace(1)* %gep.2 + %u = load double, double addrspace(1)* %gep.3 + %v = load double, double addrspace(1)* %gep.4 + + %tmp0 = fmul double %u, %v + %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 + %tmp2 = fsub double %x, %tmp1 + + store double %tmp2, double addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll new file mode 100644 index 00000000000..0a55ef77855 --- /dev/null +++ b/test/CodeGen/AMDGPU/fma.f64.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.fma.f64(double, double, double) nounwind readnone +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone + + +; FUNC-LABEL: {{^}}fma_f64: +; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double @llvm.fma.f64(double %r0, double 
%r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v2f64: +; SI: v_fma_f64 +; SI: v_fma_f64 +define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3 + %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) + store <2 x double> %r3, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v4f64: +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +; SI: v_fma_f64 +define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { + %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 + %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3 + %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2) + store <4 x double> %r3, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll new file mode 100644 index 00000000000..d6024aa0b4c --- /dev/null +++ b/test/CodeGen/AMDGPU/fma.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fma.f32(float, float, float) nounwind readnone +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}fma_f32: +; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, +; EG: FMA {{\*? *}}[[RES]] +define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) { + %r0 = load float, float addrspace(1)* %in1 + %r1 = load float, float addrspace(1)* %in2 + %r2 = load float, float addrspace(1)* %in3 + %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) + store float %r3, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v2f32: +; SI: v_fma_f32 +; SI: v_fma_f32 + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, +; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] +; EG-DAG: FMA {{\*? 
*}}[[RES]].[[CHHI]] +define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { + %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 + %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 + %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3 + %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) + store <2 x float> %r3, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fma_v4f32: +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 + +; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, +; EG-DAG: FMA {{\*? *}}[[RES]].X +; EG-DAG: FMA {{\*? *}}[[RES]].Y +; EG-DAG: FMA {{\*? *}}[[RES]].Z +; EG-DAG: FMA {{\*? *}}[[RES]].W +define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { + %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 + %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 + %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3 + %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) + store <4 x float> %r3, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 +; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}} +define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b) + store float %fma, float addrspace(1)* %out.gep, align 4 + ret void +} + +; FUNC-LABEL: @fma_commute_mul_s_f32 +define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %c = load float, float addrspace(1)* %in.b.gep, align 4 + + %fma = call float @llvm.fma.f32(float %a, float %b, float %c) + store float %fma, float addrspace(1)* %out.gep, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmad.ll b/test/CodeGen/AMDGPU/fmad.ll new file mode 100644 index 00000000000..935e35123f4 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmad.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = extractelement <4 x float> %reg0, i32 2 + %r3 = fmul float %r0, %r1 + %r4 = fadd float %r3, %r2 + %vec = insertelement <4 x float> undef, float %r4, i32 0 + call void 
@llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @fabs(float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fmax.ll b/test/CodeGen/AMDGPU/fmax.ll new file mode 100644 index 00000000000..d7127f485c7 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp oge float %r0, %r1 + %r3 = select i1 %r2, float %r0, float %r1 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll new file mode 100644 index 00000000000..f78c71b2826 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.maxnum.f64(double, double) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_f64: +; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}} +; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 +; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 +; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] +; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 + %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load double, double addrspace(1)* %bptr, align 8 + %c = load double, double addrspace(1)* %cptr, align 8 + %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone + %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone + store double %f1, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll new file mode 100644 index 00000000000..c3028a6217d --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax3.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.maxnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_olt_0: +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + 
%b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmax +; SI-LABEL: {{^}}test_fmax3_olt_1: +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll new file mode 100644 index 00000000000..828243888ac --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; Make sure we don't try to form FMAX_LEGACY nodes with f64 + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmax_legacy_uge_f64 +define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp uge double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f64 +define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp oge double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f64 +define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ugt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ogt_f64 +define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = 
getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ogt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll new file mode 100644 index 00000000000..413957d2982 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -0,0 +1,116 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmax_legacy_uge_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; EG: MAX +define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp uge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp oge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = 
getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ugt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ogt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; EG: MAX +define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ogt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + + +; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-NOT: v_max_ +; SI: v_cmp_gt_f32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_max_ + +; EG: MAX +define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ogt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out0, align 4 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll new file mode 100644 index 00000000000..de563cec341 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.maxnum.f64(double, double) #0 +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0 +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0 +declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0 +declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0 + +; FUNC-LABEL: @test_fmax_f64 +; SI: v_max_f64 +define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { + %val = call double @llvm.maxnum.f64(double %a, double %b) #0 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_v2f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 + store <2 x double> %val, <2 x 
double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmax_v4f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 + store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmax_v8f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 + store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_fmax_v16f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +; SI: v_max_f64 +define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 + store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll new file mode 100644 index 00000000000..3029bd02e4d --- /dev/null +++ b/test/CodeGen/AMDGPU/fmaxnum.ll @@ -0,0 +1,283 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.maxnum.f32(float, float) #0 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0 +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0 +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0 +declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0 + +declare double @llvm.maxnum.f64(double, double) + +; FUNC-LABEL: @test_fmax_f32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float %b) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_v2f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmax_v4f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) 
#0 + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmax_v8f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W +define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0 + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmax_v16f32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 +; SI: v_max_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W +define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 2143289344(nan) +define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_val_nan +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: 
buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_nan_val +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0 +; SI-NOT: v_max_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_var_immediate_f32 +; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_immediate_var_f32 +; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) 
nounwind { + %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_var_literal_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmax_literal_var_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmin.ll b/test/CodeGen/AMDGPU/fmin.ll new file mode 100644 index 00000000000..defa8c09638 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll new file mode 100644 index 00000000000..0a76699b43e --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin3.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.minnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmin3_olt_0: +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmin +; SI-LABEL: {{^}}test_fmin3_olt_1: +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_min3_f32 
[[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float, float addrspace(1)* %aptr, align 4 + %b = load float, float addrspace(1)* %bptr, align 4 + %c = load float, float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll new file mode 100644 index 00000000000..e19a48f3f7e --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -0,0 +1,77 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmin_legacy_f64 +define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { + %r0 = extractelement <4 x double> %reg0, i32 0 + %r1 = extractelement <4 x double> %reg0, i32 1 + %r2 = fcmp uge double %r0, %r1 + %r3 = select i1 %r2, double %r1, double %r0 + %vec = insertelement <4 x double> undef, double %r3, i32 0 + store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f64 +define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ule double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f64 +define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ole double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f64 +define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp olt double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f64 +define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load 
double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %cmp = fcmp ult double %a, %b + %val = select i1 %cmp, double %a, double %b + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll new file mode 100644 index 00000000000..6a625c239d7 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -0,0 +1,123 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FIXME: Should replace unsafe-fp-math with no signed zeros. + +declare i32 @llvm.r600.read.tidig.x() #1 + +; FUNC-LABEL: @test_fmin_legacy_f32 +; EG: MIN * +; SI-SAFE: v_min_legacy_f32_e32 +; SI-NONAN: v_min_f32_e32 +define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + %vec = insertelement <4 x float> undef, float %r3, i32 0 + store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ule float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 
addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp olt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f32 +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ult float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use +; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-NOT: v_min +; SI: v_cmp_le_f32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: s_endpgm +define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val0 = select i1 %cmp, float %a, float %b + store float %val0, float addrspace(1)* %out0, align 4 + store i1 %cmp, i1 addrspace(1)* %out1 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll new file mode 100644 index 00000000000..0f929d6a81f --- /dev/null +++ b/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -0,0 +1,76 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.minnum.f64(double, double) #0 +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 +declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 +declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 + +; FUNC-LABEL: @test_fmin_f64 +; SI: v_min_f64 +define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { + %val = call double @llvm.minnum.f64(double %a, double %b) #0 + store double %val, 
double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_v2f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { + %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 + store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_v4f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { + %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 + store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmin_v8f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { + %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 + store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @test_fmin_v16f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +; SI: v_min_f64 +define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { + %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 + store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll new file mode 100644 index 00000000000..4d7b52540d8 --- /dev/null +++ b/test/CodeGen/AMDGPU/fminnum.ll @@ -0,0 +1,281 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.minnum.f32(float, float) #0 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 +declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 + +; FUNC-LABEL: @test_fmin_f32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %val = call float @llvm.minnum.f32(float %a, float %b) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_v2f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { + %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @test_fmin_v4f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: 
v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { + %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_v8f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { + %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; FUNC-LABEL: @test_fmin_v16f32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 +; SI: v_min_f32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W +define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { + %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 
2143289344({{nan|1\.#QNAN0e\+00}}) +define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_val_nan +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_nan_val +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 +; SI-NOT: v_min_f32_e32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; SI: buffer_store_dword [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { + %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_var_immediate_f32 +; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float 
@llvm.minnum.f32(float %a, float 2.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_immediate_var_f32 +; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_var_literal_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @fmin_literal_var_f32 +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] + +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { + %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll new file mode 100644 index 00000000000..addc409c9eb --- /dev/null +++ b/test/CodeGen/AMDGPU/fmul.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}fmul_f32: +; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W + +; SI: v_mul_f32 +define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fmul float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +; FUNC-LABEL: {{^}}fmul_v2f32: +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} + +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fmul <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v4f32: +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fmul <4 x float> %a, %b + store <4 x float> %result, <4 x float> 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_mul_2_k: +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +; SI: s_endpgm +define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { + %y = fmul float %x, 2.0 + %z = fmul float %y, 3.0 + store float %z, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_mul_2_k_inv: +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +; SI-NOT: v_mad_f32 +; SI: s_endpgm +define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { + %y = fmul float %x, 3.0 + %z = fmul float %y, 2.0 + store float %z, float addrspace(1)* %out + ret void +} + +; There should be three multiplies here; %a should be used twice (once +; negated), not duplicated into mul x, 5.0 and mul x, -5.0. +; FUNC-LABEL: {{^}}test_mul_twouse: +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { + %a = fmul float %x, 5.0 + %b = fsub float -0.0, %a + %c = fmul float %b, %y + %d = fmul float %c, %a + store float %d, float addrspace(1)* %out + ret void +} + +attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll new file mode 100644 index 00000000000..3c222eaba89 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmul64.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +; FUNC-LABEL: {{^}}fmul_f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fmul double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v2f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) { + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 + %r2 = fmul <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fmul_v4f64: +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, + <4 x double> addrspace(1)* %in2) { + %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 + %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 + %r2 = fmul <4 x double> %r0, %r1 + store <4 x double> %r2, <4 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll new file mode 100644 index 00000000000..ae84d841021 --- /dev/null +++ b/test/CodeGen/AMDGPU/fmuladd.ll @@ -0,0 +1,199 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | 
FileCheck %s + +declare float @llvm.fmuladd.f32(float, float, float) +declare double @llvm.fmuladd.f64(double, double, double) +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; CHECK-LABEL: {{^}}fmuladd_f32: +; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} + +define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2, float addrspace(1)* %in3) { + %r0 = load float, float addrspace(1)* %in1 + %r1 = load float, float addrspace(1)* %in2 + %r2 = load float, float addrspace(1)* %in3 + %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) + store float %r3, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_f64: +; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2, double addrspace(1)* %in3) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = load double, double addrspace(1)* %in3 + %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) + store double %r3, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; CHECK-LABEL: {{^}}fadd_a_a_b_f32: +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword 
[[RESULT]] +define void @fadd_a_a_b_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load float, float addrspace(1)* %gep.0 + %r1 = load float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %add.0, %r1 + store float %add.1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fadd_b_a_a_f32: +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fadd_b_a_a_f32(float addrspace(1)* %out, + float addrspace(1)* %in1, + float addrspace(1)* %in2) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r0 = load float, float addrspace(1)* %gep.0 + %r1 = load float, float addrspace(1)* %gep.1 + + %add.0 = fadd float %r0, %r0 + %add.1 = fadd float %r1, %add.0 + store float %add.1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + + +; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret 
void +} + + +; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + + +; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32 +; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; CHECK: buffer_store_dword [[RESULT]] +define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %r2.fneg = fsub float -0.000000e+00, %r2 + + %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) + store float %r3, float addrspace(1)* %gep.out + ret void +} diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll new file mode 100644 index 00000000000..4fa9adaabda --- /dev/null +++ b/test/CodeGen/AMDGPU/fnearbyint.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s + +; This should have the exactly the same output as the test for rint, +; so no need to check anything. 
+ +declare float @llvm.nearbyint.f32(float) #0 +declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0 +declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0 +declare double @llvm.nearbyint.f64(double) #0 +declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 +declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 + + +define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { +entry: + %0 = call float @llvm.nearbyint.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +entry: + %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +entry: + %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +define void @nearbyint_f64(double addrspace(1)* %out, double %in) { +entry: + %0 = call double @llvm.nearbyint.f64(double %in) + store double %0, double addrspace(1)* %out + ret void +} +define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +entry: + %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) + store <2 x double> %0, <2 x double> addrspace(1)* %out + ret void +} + +define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +entry: + %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) + store <4 x double> %0, <4 x double> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll new file mode 100644 index 00000000000..8830e827366 --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FIXME: Check something here. Currently it seems fabs + fneg aren't +; into 2 modifiers, although theoretically that should work. 
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fadd = fadd double %y, %fsub
+ store double %fadd, double addrspace(1)* %out, align 8
+ ret void
+}
+
+define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+ %x = load double, double addrspace(1)* %xptr, align 8
+ %y = load double, double addrspace(1)* %xptr, align 8
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fadd = fadd double %y, %fsub
+ store double %fadd, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+ %fabs = call double @llvm.fabs.f64(double %x)
+ %fsub = fsub double -0.000000e+00, %fabs
+ %fmul = fmul double %y, %fsub
+ store double %fmul, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fabs = call double @llvm.fabs.f64(double %bc)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+ %bc = bitcast i64 %in to double
+ %fabs = call double @fabs(double %bc)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: s_load_dwordx2
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+ %fabs = call double @llvm.fabs.f64(double %in)
+ %fsub = fsub double -0.000000e+00, %fabs
+ store double %fsub, double addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+ %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+ %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
+ store <2 x double> %fsub, <2 x double> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+ %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+ %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
+ 
store <4 x double> %fsub, <4 x double> addrspace(1)* %out + ret void +} + +declare double @fabs(double) readnone +declare double @llvm.fabs.f64(double) readnone +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll new file mode 100644 index 00000000000..3b4930d9897 --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -0,0 +1,118 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: +; SI-NOT: and +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| +define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { + %fabs = call float @llvm.fabs.f32(float %x) + %fsub = fsub float -0.000000e+00, %fabs + %fadd = fadd float %y, %fsub + store float %fadd, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: +; SI-NOT: and +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| +; SI-NOT: and +define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { + %fabs = call float @llvm.fabs.f32(float %x) + %fsub = fsub float -0.000000e+00, %fabs + %fmul = fmul float %y, %fsub + store float %fmul, float addrspace(1)* %out, align 4 + ret void +} + +; DAGCombiner will transform: +; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) +; unless isFabsFree returns true + +; FUNC-LABEL: {{^}}fneg_fabs_free_f32: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| +; R600: -PV + +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fabs = call float @llvm.fabs.f32(float %bc) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32: +; R600-NOT: AND +; R600: |PV.{{[XYZW]}}| +; R600: -PV + +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fabs = call float @fabs(float %bc) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_f32: +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { + %fabs = call float @llvm.fabs.f32(float %in) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_fneg_fabs_f32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} +define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %val = load float, float addrspace(1)* %in, align 4 + %fabs = call float @llvm.fabs.f32(float %val) + %fsub = fsub float -0.000000e+00, %fabs + store float %fsub, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fneg_fabs_v2f32: +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; R600: -PV +; R600: |{{(PV|T[0-9])\.[XYZW]}}| +; 
R600: -PV + +; FIXME: SGPR should be used directly for first src operand. +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI-NOT: 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) + %fsub = fsub <2 x float> , %fabs + store <2 x float> %fsub, <2 x float> addrspace(1)* %out + ret void +} + +; FIXME: SGPR should be used directly for first src operand. +; FUNC-LABEL: {{^}}fneg_fabs_v4f32: +; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 +; SI-NOT: 0x80000000 +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] +define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { + %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) + %fsub = fsub <4 x float> , %fabs + store <4 x float> %fsub, <4 x float> addrspace(1)* %out + ret void +} + +declare float @fabs(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll new file mode 100644 index 00000000000..aa6df209035 --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg.f64.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fneg_f64: +; GCN: v_xor_b32 +define void @fneg_f64(double addrspace(1)* %out, double %in) { + %fneg = fsub double -0.000000e+00, %in + store double %fneg, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v2f64: +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { + %fneg = fsub <2 x double> , %in + store <2 x double> %fneg, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v4f64: +; R600: -PV +; R600: -T +; R600: -PV +; R600: -PV + +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { + %fneg = fsub <4 x double> , %in + store <4 x double> %fneg, <4 x double> addrspace(1)* %out + ret void +} + +; DAGCombiner will transform: +; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x80000000)) +; unless the target returns true for isNegFree() + +; FUNC-LABEL: {{^}}fneg_free_f64: +; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}} +define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { + %bc = bitcast i64 %in to double + %fsub = fsub double 0.0, %bc + store double %fsub, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fneg_fold_f64: +; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-NOT: xor +; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] +define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { + %fsub = fsub double -0.0, %in + %fmul = fmul double %fsub, 
%in + store double %fmul, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll new file mode 100644 index 00000000000..a0fd539863c --- /dev/null +++ b/test/CodeGen/AMDGPU/fneg.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fneg_f32: +; R600: -PV + +; GCN: v_xor_b32 +define void @fneg_f32(float addrspace(1)* %out, float %in) { + %fneg = fsub float -0.000000e+00, %in + store float %fneg, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v2f32: +; R600: -PV +; R600: -PV + +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { + %fneg = fsub <2 x float> , %in + store <2 x float> %fneg, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_v4f32: +; R600: -PV +; R600: -T +; R600: -PV +; R600: -PV + +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +; GCN: v_xor_b32 +define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { + %fneg = fsub <4 x float> , %in + store <4 x float> %fneg, <4 x float> addrspace(1)* %out + ret void +} + +; DAGCombiner will transform: +; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000)) +; unless the target returns true for isNegFree() + +; FUNC-LABEL: {{^}}fneg_free_f32: +; R600-NOT: XOR +; R600: -KC0[2].Z + +; XXX: We could use v_add_f32_e64 with the negate bit here instead. 
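+; Illustrative only (hypothetical registers, not what is currently emitted or
+; checked): the VOP3 form with a negated source modifier would look roughly like
+;   v_add_f32_e64 v0, 0, -s0
+; computing 0 + (-x), which is equivalent to the fsub-from-zero matched below.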
+; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} +define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { + %bc = bitcast i32 %in to float + %fsub = fsub float 0.0, %bc + store float %fsub, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fneg_fold_f32: +; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb +; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GCN-NOT: xor +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] +define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { + %fsub = fsub float -0.0, %in + %fmul = fmul float %fsub, %in + store float %fmul, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll new file mode 100644 index 00000000000..4fac5176fac --- /dev/null +++ b/test/CodeGen/AMDGPU/fp-classify.ll @@ -0,0 +1,131 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 +declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_isinf_pattern: +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} +; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] +; SI-NOT: v_cmp +; SI: s_endpgm +define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_not_isinf_pattern_0: +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_not_isinf_pattern_1: +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { + %fabs = tail call float @llvm.fabs.f32(float %x) #1 + %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 + %ext = zext i1 %cmp to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_isfinite_pattern_0: +; SI-NOT: v_cmp +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} +; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] +; SI-NOT: v_cmp +; SI: s_endpgm +define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Use negative infinity +; SI-LABEL: {{^}}test_isfinite_not_pattern_0: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; No 
fabs +; SI-LABEL: {{^}}test_isfinite_not_pattern_1: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %ninf = fcmp une float %x, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; fabs of different value +; SI-LABEL: {{^}}test_isfinite_not_pattern_2: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Wrong ordered compare type +; SI-LABEL: {{^}}test_isfinite_not_pattern_3: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp uno float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Wrong unordered compare +; SI-LABEL: {{^}}test_isfinite_not_pattern_4: +; SI-NOT: v_cmp_class_f32 +; SI: s_endpgm +define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { + %ord = fcmp ord float %x, 0.000000e+00 + %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 + %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 + %and = and i1 %ord, %ninf + %ext = zext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp16_to_fp.ll b/test/CodeGen/AMDGPU/fp16_to_fp.ll new file mode 100644 index 00000000000..5a79ca82bc2 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp16_to_fp.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone +declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone + +; SI-LABEL: {{^}}test_convert_fp16_to_fp32: +; SI: buffer_load_ushort [[VAL:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]] +define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + + +; SI-LABEL: {{^}}test_convert_fp16_to_fp64: +; SI: buffer_load_ushort [[VAL:v[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] +; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { + %val = load i16, i16 addrspace(1)* %in, align 2 + %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone + store double %cvt, double addrspace(1)* %out, align 4 + ret void +} diff --git 
a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll new file mode 100644 index 00000000000..67925ebd82b --- /dev/null +++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone + +; SI-LABEL: {{^}}test_convert_fp32_to_fp16: +; SI: buffer_load_dword [[VAL:v[0-9]+]] +; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_short [[RESULT]] +define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float, float addrspace(1)* %in, align 4 + %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone + store i16 %cvt, i16 addrspace(1)* %out, align 2 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll new file mode 100644 index 00000000000..12df6606e8f --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -0,0 +1,56 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @fp_to_sint_f64_i32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { + %result = fptosi double %in to i32 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { + %result = fptosi <2 x double> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_v4f64_v4i32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +; SI: v_cvt_i32_f64_e32 +define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { + %result = fptosi <4 x double> %in to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @fp_to_sint_i64_f64 +; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} +; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 + +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] + +; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 + +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] +; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] +; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %val = load double, double addrspace(1)* %gep, align 8 + %cast = fptosi double %val to i64 + store i64 %cast, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll new file mode 100644 index 
00000000000..301a94b4904 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -0,0 +1,230 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +declare float @llvm.fabs.f32(float) #0 + +; FUNC-LABEL: {{^}}fp_to_sint_i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: s_endpgm +define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { + %conv = fptosi float %in to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: +; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} +define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { + %in.fabs = call float @llvm.fabs.f32(float %in) #0 + %conv = fptosi float %in.fabs to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_v2i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptosi <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_v4i32: +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +; SI: v_cvt_i32_f32_e32 +define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %value = load <4 x float>, <4 x float> addrspace(1) * %in + %result = fptosi <4 x float> %value to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_sint_i64: + +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; Check that the compiler doesn't crash with a "cannot select" error +; SI: s_endpgm +define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +entry: + %0 = fptosi float %in to i64 + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_sint_v2i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; 
EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptosi <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_sint_v4i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptosi <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll new file mode 100644 index 00000000000..41bc2a78001 --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}fp_to_uint_i32_f64: +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { + %cast = fptoui double %in to i32 + store i32 %cast, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @fp_to_uint_v2i32_v2f64 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { + %cast = fptoui <2 x double> %in to <2 x i32> + store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @fp_to_uint_v4i32_v4f64 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +; SI: v_cvt_u32_f64_e32 +define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { + %cast = fptoui <4 x double> %in to <4 x i32> + store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @fp_to_uint_i64_f64 +; CI-DAG: buffer_load_dwordx2 
[[VAL:v\[[0-9]+:[0-9]+\]]] +; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] +; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} +; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 + +; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} +; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] + +; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 + +; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] +; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] +; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] +; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %val = load double, double addrspace(1)* %gep, align 8 + %cast = fptoui double %val to i64 + store i64 %cast, i64 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: @fp_to_uint_v2i64_v2f64 +define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { + %cast = fptoui <2 x double> %in to <2 x i64> + store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: @fp_to_uint_v4i64_v4f64 +define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { + %cast = fptoui <4 x double> %in to <4 x i64> + store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll new file mode 100644 index 00000000000..b7b6ccc238b --- /dev/null +++ b/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -0,0 +1,217 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} + +; SI: v_cvt_u32_f32_e32 +; SI: s_endpgm +define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { + %conv = fptoui float %in to i32 + store i32 %conv, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { + %result = fptoui <2 x float> %in to <2 x i32> + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32: +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 +; SI: v_cvt_u32_f32_e32 + +define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %value = load <4 x float>, <4 x float> addrspace(1) * %in + %result = fptoui <4 x float> %value to <4 x i32> + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_f32_to_i64: +; EG-DAG: AND_INT 
+; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { + %conv = fptoui float %x to i64 + store i64 %conv, i64 addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { + %conv = fptoui <2 x float> %x to <2 x i64> + store <2 x i64> %conv, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64: +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: AND_INT +; EG-DAG: LSHR +; EG-DAG: SUB_INT +; EG-DAG: AND_INT +; EG-DAG: ASHR +; EG-DAG: AND_INT +; EG-DAG: OR_INT +; EG-DAG: SUB_INT +; EG-DAG: LSHL +; EG-DAG: LSHL +; EG-DAG: SUB_INT +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT +; EG-DAG: SETGT_INT +; EG-DAG: XOR_INT +; EG-DAG: XOR_INT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: s_endpgm +define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { + %conv = fptoui <4 x float> %x to <4 x i64> + store <4 x i64> %conv, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll new file mode 100644 index 00000000000..734a43be229 --- /dev/null +++ 
b/test/CodeGen/AMDGPU/fpext.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fpext_f32_to_f64: +; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { + %result = fpext float %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { + %result = fpext <2 x float> %in to <2 x double> + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { + %result = fpext <4 x float> %in to <4 x double> + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64: +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +; SI: v_cvt_f64_f32_e32 +define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { + %result = fpext <8 x float> %in to <8 x double> + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll new file mode 100644 index 00000000000..385e10e7baa --- /dev/null +++ b/test/CodeGen/AMDGPU/fptrunc.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: +; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { + %result = fptrunc double %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { + %result = fptrunc <2 x double> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { + %result = fptrunc <4 x double> %in to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32: +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +; SI: v_cvt_f32_f64_e32 +define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { + %result = fptrunc <8 x double> %in to <8 x float> + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/frem.ll 
b/test/CodeGen/AMDGPU/frem.ll new file mode 100644 index 00000000000..f245ef08cb9 --- /dev/null +++ b/test/CodeGen/AMDGPU/frem.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}frem_f32: +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 +; GCN-DAG: v_cmp +; GCN-DAG: v_mul_f32 +; GCN: v_rcp_f32_e32 +; GCN: v_mul_f32_e32 +; GCN: v_mul_f32_e32 +; GCN: v_trunc_f32_e32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2) #0 { + %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 + %r0 = load float, float addrspace(1)* %in1, align 4 + %r1 = load float, float addrspace(1)* %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_frem_f32: +; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 +; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} +; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] +; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] +; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] +; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: s_endpgm +define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, + float addrspace(1)* %in2) #1 { + %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 + %r0 = load float, float addrspace(1)* %in1, align 4 + %r1 = load float, float addrspace(1)* %gep2, align 4 + %r2 = frem float %r0, %r1 + store float %r2, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}frem_f64: +; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 +; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 +; GCN-DAG: v_div_fmas_f64 +; GCN-DAG: v_div_scale_f64 +; GCN-DAG: v_mul_f64 +; CI: v_trunc_f64_e32 +; CI: v_mul_f64 +; GCN: v_add_f64 +; GCN: buffer_store_dwordx2 +; GCN: s_endpgm +define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) #0 { + %r0 = load double, double addrspace(1)* %in1, align 8 + %r1 = load double, double addrspace(1)* %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}unsafe_frem_f64: +; GCN: v_rcp_f64_e32 +; GCN: v_mul_f64 +; SI: v_bfe_u32 +; CI: v_trunc_f64_e32 +; GCN: v_fma_f64 +; GCN: s_endpgm +define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) #1 { + %r0 = load double, double addrspace(1)* %in1, align 8 + %r1 = load double, double addrspace(1)* %in2, align 8 + %r2 = frem double %r0, %r1 + store double %r2, double addrspace(1)* %out, align 8 + ret void +} + +define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, + <2 x float> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 + %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 + %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 + %r2 = frem <2 x float> %r0, %r1 + store <2 x float> 
%r2, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, + <4 x float> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 + %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 + %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 + %r2 = frem <4 x float> %r0, %r1 + store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, + <2 x double> addrspace(1)* %in2) #0 { + %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 + %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 + %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 + %r2 = frem <2 x double> %r0, %r1 + store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll new file mode 100644 index 00000000000..04101346cdf --- /dev/null +++ b/test/CodeGen/AMDGPU/fsqrt.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s + +; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) + +; CHECK: {{^}}fsqrt_f32: +; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} + +define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %r0 = load float, float addrspace(1)* %in + %r1 = call float @llvm.sqrt.f32(float %r0) + store float %r1, float addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fsqrt_f64: +; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} + +define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %r0 = load double, double addrspace(1)* %in + %r1 = call double @llvm.sqrt.f64(double %r0) + store double %r1, double addrspace(1)* %out + ret void +} + +declare float @llvm.sqrt.f32(float %Val) +declare double @llvm.sqrt.f64(double %Val) diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll new file mode 100644 index 00000000000..dfe41cb5b11 --- /dev/null +++ b/test/CodeGen/AMDGPU/fsub.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}v_fsub_f32: +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %a = load float, float addrspace(1)* %in, align 4 + %b = load float, float addrspace(1)* %b_ptr, align 4 + %result = fsub float %a, %b + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_fsub_f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W + +; SI: v_sub_f32_e32 {{v[0-9]+}}, 
{{s[0-9]+}}, {{v[0-9]+}} +define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { + %sub = fsub float %a, %b + store float %sub, float addrspace(1)* %out, align 4 + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +; FUNC-LABEL: {{^}}fsub_v2f32: +; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z +; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y + +; FIXME: Should be using SGPR directly for first operand +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { + %sub = fsub <2 x float> %a, %b + store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_fsub_v4f32: +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} + +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 + %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 + %result = fsub <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: Should be using SGPR directly for first operand + +; FUNC-LABEL: {{^}}s_fsub_v4f32: +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: s_endpgm +define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { + %result = fsub <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll new file mode 100644 index 00000000000..f34a48e30a8 --- /dev/null +++ b/test/CodeGen/AMDGPU/fsub64.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.fabs.f64(double) #0 + +; SI-LABEL: {{^}}fsub_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r2 = fsub double %r0, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_fabs_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} +define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* 
%in1 + %r1 = load double, double addrspace(1)* %in2 + %r1.fabs = call double @llvm.fabs.f64(double %r1) #0 + %r2 = fsub double %r0, %r1.fabs + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_fabs_inv_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} +define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, + double addrspace(1)* %in2) { + %r0 = load double, double addrspace(1)* %in1 + %r1 = load double, double addrspace(1)* %in2 + %r0.fabs = call double @llvm.fabs.f64(double %r0) #0 + %r2 = fsub double %r0.fabs, %r1 + store double %r2, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double %a, %b + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_imm_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double 4.0, %a + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_imm_inv_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { + %sub = fsub double %a, 4.0 + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_self_f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} +define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { + %sub = fsub double %a, %a + store double %sub, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_v2f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { + %sub = fsub <2 x double> %a, %b + store <2 x double> %sub, <2 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}fsub_v4f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 + %a = load <4 x double>, <4 x double> addrspace(1)* %in + %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr + %result = fsub <4 x double> %a, %b + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_fsub_v4f64: +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { + %result = fsub <4 x double> %a, %b + store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +attributes #0 = { nounwind 
readnone } diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll new file mode 100644 index 00000000000..6618d8b5e57 --- /dev/null +++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.trunc.f64(double) nounwind readnone +declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone +declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone +declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone +declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone +declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone + +; FUNC-LABEL: {{^}}v_ftrunc_f64: +; CI: v_trunc_f64 +; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 +; SI: s_endpgm +define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %x = load double, double addrspace(1)* %in, align 8 + %y = call double @llvm.trunc.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_f64: +; CI: v_trunc_f64_e32 + +; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 +; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 +; SI: s_lshr_b64 +; SI: s_not_b64 +; SI: s_and_b64 +; SI: cmp_gt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: cmp_lt_i32 +; SI: cndmask_b32 +; SI: cndmask_b32 +; SI: s_endpgm +define void @ftrunc_f64(double addrspace(1)* %out, double %x) { + %y = call double @llvm.trunc.f64(double %x) nounwind readnone + store double %y, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v2f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { + %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone + store <2 x double> %y, <2 x double> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64: +; FIXME-CI: v_trunc_f64_e32 +; FIXME-CI: v_trunc_f64_e32 +; FIXME-CI: v_trunc_f64_e32 +; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone +; store <3 x double> %y, <3 x double> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ftrunc_v4f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { + %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone + store <4 x double> %y, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v8f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { + %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone + store <8 x double> %y, <8 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v16f64: +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; 
CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +; CI: v_trunc_f64_e32 +define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { + %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone + store <16 x double> %y, <16 x double> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll new file mode 100644 index 00000000000..edc08609a8a --- /dev/null +++ b/test/CodeGen/AMDGPU/ftrunc.ll @@ -0,0 +1,120 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s + +declare float @llvm.trunc.f32(float) nounwind readnone +declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone +declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone +declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone +declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone +declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone + +; FUNC-LABEL: {{^}}ftrunc_f32: +; EG: TRUNC +; SI: v_trunc_f32_e32 +define void @ftrunc_f32(float addrspace(1)* %out, float %x) { + %y = call float @llvm.trunc.f32(float %x) nounwind readnone + store float %y, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v2f32: +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { + %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone + store <2 x float> %y, <2 x float> addrspace(1)* %out + ret void +} + +; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32: +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-EG: TRUNC +; FIXME-SI: v_trunc_f32_e32 +; FIXME-SI: v_trunc_f32_e32 +; FIXME-SI: v_trunc_f32_e32 +; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone +; store <3 x float> %y, <3 x float> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}ftrunc_v4f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { + %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone + store <4 x float> %y, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v8f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { + %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone + store <8 x float> %y, <8 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ftrunc_v16f32: +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; EG: TRUNC +; SI: 
v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +; SI: v_trunc_f32_e32 +define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { + %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone + store <16 x float> %y, <16 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll new file mode 100644 index 00000000000..471b0f6b13e --- /dev/null +++ b/test/CodeGen/AMDGPU/gep-address-space.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s + +define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { +; CHECK-LABEL: {{^}}use_gep_address_space: +; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} +; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 + %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16 + store i32 99, i32 addrspace(3)* %p + ret void +} + +define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { +; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: +; The LDS offset will be 65536 bytes, which is larger than the size of LDS on +; SI, which is why it is being OR'd with the base pointer. 
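+; (Worked out from the GEP below: index 16384 into an i32 array gives a byte
+; offset of 16384 * 4 = 65536, i.e. 0x10000.)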
+; SI: s_or_b32 +; CI: s_add_i32 +; CHECK: ds_write_b32 + %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 + store i32 99, i32 addrspace(3)* %p + ret void +} + +define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { +; CHECK-LABEL: {{^}}gep_as_vector_v4: +; CHECK: s_add_i32 +; CHECK: s_add_i32 +; CHECK: s_add_i32 +; CHECK: s_add_i32 + %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> + %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 + %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 + %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2 + %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3 + store i32 99, i32 addrspace(3)* %p0 + store i32 99, i32 addrspace(3)* %p1 + store i32 99, i32 addrspace(3)* %p2 + store i32 99, i32 addrspace(3)* %p3 + ret void +} + +define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { +; CHECK-LABEL: {{^}}gep_as_vector_v2: +; CHECK: s_add_i32 +; CHECK: s_add_i32 + %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> + %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 + %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 + store i32 99, i32 addrspace(3)* %p0 + store i32 99, i32 addrspace(3)* %p1 + ret void +} + diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll new file mode 100644 index 00000000000..be775cf9292 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-directive.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; Make sure the GlobalDirective isn't merged with the function name + +; SI: .globl foo +; SI: {{^}}foo: +define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global-extload-i1.ll b/test/CodeGen/AMDGPU/global-extload-i1.ll new file mode 100644 index 00000000000..bd9557d730f --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i1.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: Evergreen broken + +; FUNC-LABEL: {{^}}zextload_global_i1_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = zext i1 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i1_to_i32: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = sext i1 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32: +; 
SI: s_endpgm +define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = zext <1 x i1> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32: +; SI: s_endpgm +define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = sext <1 x i1> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = zext <2 x i1> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = sext <2 x i1> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = zext <4 x i1> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = sext <4 x i1> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = zext <8 x i1> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = sext <8 x i1> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = zext <16 x i1> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = sext <16 x i1> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32: 
+; XSI: s_endpgm +; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = zext <32 x i1> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32: +; XSI: s_endpgm +; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = sext <32 x i1> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32: +; XSI: s_endpgm +; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = zext <64 x i1> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32: +; XSI: s_endpgm +; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = sext <64 x i1> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} +; SI: buffer_store_dwordx2 +define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = zext i1 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i1_to_i64: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], +; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %a = load i1, i1 addrspace(1)* %in + %ext = sext i1 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = zext <1 x i1> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i1>, <1 x i1> addrspace(1)* %in + %ext = sext <1 x i1> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> addrspace(1)* %in + %ext = zext <2 x i1> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i1>, <2 x i1> 
addrspace(1)* %in + %ext = sext <2 x i1> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = zext <4 x i1> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i1>, <4 x i1> addrspace(1)* %in + %ext = sext <4 x i1> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = zext <8 x i1> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i1>, <8 x i1> addrspace(1)* %in + %ext = sext <8 x i1> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = zext <16 x i1> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i1>, <16 x i1> addrspace(1)* %in + %ext = sext <16 x i1> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64: +; XSI: s_endpgm +; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = zext <32 x i1> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64: +; XSI: s_endpgm +; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in +; %ext = sext <32 x i1> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64: +; XSI: s_endpgm +; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = zext <64 x i1> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64: +; XSI: s_endpgm +; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x 
i1> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in +; %ext = sext <64 x i1> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll new file mode 100644 index 00000000000..103a40dee27 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -0,0 +1,302 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; FIXME: cypress is broken because the bigger testcases spill and it's not implemented + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: +; SI: buffer_load_ushort +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: +; SI: buffer_load_sshort +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: +; SI: buffer_load_ushort +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: +; SI: buffer_load_sshort +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* 
nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i32> + store <32 x i32> %ext, <32 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i32> + store <64 x i32> %ext, <64 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: +; SI: buffer_load_ushort v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = zext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; 
FUNC-LABEL: {{^}}sextload_global_i16_to_i64: +; SI: buffer_load_sshort [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %a = load i16, i16 addrspace(1)* %in + %ext = sext i16 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = zext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i16>, <1 x i16> addrspace(1)* %in + %ext = sext <1 x i16> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = zext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i16>, <2 x i16> addrspace(1)* %in + %ext = sext <2 x i16> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = zext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i16>, <4 x i16> addrspace(1)* %in + %ext = sext <4 x i16> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = zext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i16>, <8 x i16> addrspace(1)* %in + %ext = sext <8 x i16> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = zext <16 x i16> %load to 
<16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i16>, <16 x i16> addrspace(1)* %in + %ext = sext <16 x i16> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = zext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: +; SI: s_endpgm +define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i16>, <32 x i16> addrspace(1)* %in + %ext = sext <32 x i16> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = zext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: +; SI: s_endpgm +define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { + %load = load <64 x i16>, <64 x i16> addrspace(1)* %in + %ext = sext <64 x i16> %load to <64 x i64> + store <64 x i64> %ext, <64 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll new file mode 100644 index 00000000000..79b83452939 --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -0,0 +1,457 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: +; SI: buffer_load_dword v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %a = load i32, i32 addrspace(1)* %in + %ext = zext i32 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i32_to_i64: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %a = load i32, i32 addrspace(1)* %in + %ext = sext i32 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64: +; SI: buffer_load_dword +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x 
i32>, <1 x i32> addrspace(1)* %in + %ext = zext <1 x i32> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64: +; SI: buffer_load_dword +; SI: v_ashrrev_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i32>, <1 x i32> addrspace(1)* %in + %ext = sext <1 x i32> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: +; SI: buffer_load_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in + %ext = zext <2 x i32> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64: +; SI: buffer_load_dwordx2 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i32>, <2 x i32> addrspace(1)* %in + %ext = sext <2 x i32> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i32>, <4 x i32> addrspace(1)* %in + %ext = zext <4 x i32> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64: +; SI: buffer_load_dwordx4 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i32>, <4 x i32> addrspace(1)* %in + %ext = sext <4 x i32> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i32>, <8 x i32> addrspace(1)* %in + %ext = zext <8 x i32> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; 
SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i32>, <8 x i32> addrspace(1)* %in + %ext = sext <8 x i32> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI: s_endpgm +define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i32>, <16 x i32> addrspace(1)* %in + %ext = sext <16 x i32> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 + +; SI: s_endpgm +define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i32>, <16 x i32> addrspace(1)* %in + %ext = zext <16 x i32> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out 
+ ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 +; SI-DAG: v_ashrrev_i32 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i32>, <32 x i32> addrspace(1)* %in + %ext = sext <32 x i32> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: 
buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 + +; SI: s_endpgm +define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { + %load = load <32 x i32>, <32 x i32> addrspace(1)* %in + %ext = zext <32 x i32> %load to <32 x i64> + store <32 x i64> %ext, <32 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global-extload-i8.ll b/test/CodeGen/AMDGPU/global-extload-i8.ll new file mode 100644 index 00000000000..b31d5361d5a --- /dev/null +++ b/test/CodeGen/AMDGPU/global-extload-i8.ll @@ -0,0 +1,299 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}zextload_global_i8_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = zext i8 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i8_to_i32: +; SI: buffer_load_sbyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i32> + store <1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i32> + store 
<1 x i32> %ext, <1 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i32> + store <4 x i32> %ext, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i32> + store <8 x i32> %ext, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32: +; SI: s_endpgm +define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = sext <16 x i8> %load to <16 x i32> + store <16 x i32> %ext, <16 x i32> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32: +; XSI: s_endpgm +; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = zext <32 x i8> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32: +; XSI: s_endpgm +; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = sext 
<32 x i8> %load to <32 x i32> +; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32: +; XSI: s_endpgm +; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = zext <64 x i8> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32: +; XSI: s_endpgm +; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = sext <64 x i8> %load to <64 x i32> +; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out +; ret void +; } + +; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: +; SI: buffer_load_ubyte v[[LO:[0-9]+]], +; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] +define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = zext i8 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_i8_to_i64: +; SI: buffer_load_sbyte [[LOAD:v[0-9]+]], +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] +; SI: buffer_store_dwordx2 +define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %a = load i8, i8 addrspace(1)* %in + %ext = sext i8 %a to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64: +; SI: s_endpgm +define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = zext <1 x i8> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64: +; SI: s_endpgm +define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <1 x i8>, <1 x i8> addrspace(1)* %in + %ext = sext <1 x i8> %load to <1 x i64> + store <1 x i64> %ext, <1 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64: +; SI: s_endpgm +define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = zext <2 x i8> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64: +; SI: s_endpgm +define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <2 x i8>, <2 x i8> addrspace(1)* %in + %ext = sext <2 x i8> %load to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64: +; SI: s_endpgm +define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = zext <4 x i8> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64: +; SI: s_endpgm +define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x 
i8> addrspace(1)* nocapture %in) nounwind { + %load = load <4 x i8>, <4 x i8> addrspace(1)* %in + %ext = sext <4 x i8> %load to <4 x i64> + store <4 x i64> %ext, <4 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64: +; SI: s_endpgm +define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = zext <8 x i8> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64: +; SI: s_endpgm +define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <8 x i8>, <8 x i8> addrspace(1)* %in + %ext = sext <8 x i8> %load to <8 x i64> + store <8 x i64> %ext, <8 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64: +; SI: s_endpgm +define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = zext <16 x i8> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64: +; SI: s_endpgm +define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { + %load = load <16 x i8>, <16 x i8> addrspace(1)* %in + %ext = sext <16 x i8> %load to <16 x i64> + store <16 x i64> %ext, <16 x i64> addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64: +; XSI: s_endpgm +; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = zext <32 x i8> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64: +; XSI: s_endpgm +; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in +; %ext = sext <32 x i8> %load to <32 x i64> +; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64: +; XSI: s_endpgm +; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = zext <64 x i8> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } + +; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64: +; XSI: s_endpgm +; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { +; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in +; %ext = sext <64 x i8> %load to <64 x i64> +; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out +; ret void +; } diff --git a/test/CodeGen/AMDGPU/global-zero-initializer.ll b/test/CodeGen/AMDGPU/global-zero-initializer.ll new file mode 100644 index 00000000000..45aa8bf4e1d --- /dev/null +++ b/test/CodeGen/AMDGPU/global-zero-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported 
initializer for address space in load_init_global_global + +@lds = addrspace(1) global [256 x i32] zeroinitializer + +define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(1)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll new file mode 100644 index 00000000000..847950f6376 --- /dev/null +++ b/test/CodeGen/AMDGPU/global_atomics.ll @@ -0,0 +1,801 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}atomic_add_i32_offset: +; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: +; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: +; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: +; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32: +; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret: +; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_addr64: +; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile add i32 addrspace(1)* 
%ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: +; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_offset: +; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: +; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: +; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: +; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32: +; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret: +; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_addr64: +; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: +; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_offset: +; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: +; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32: +; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: +; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: +; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* 
%out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_offset: +; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: +; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32: +; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_addr64: +; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: +; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, 
i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_offset: +; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: +; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32: +; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: +; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: +; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_offset: +; SI: 
buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: +; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32: +; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_addr64: +; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: +; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_offset: +; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { 
+entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: +; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32: +; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: +; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: +; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_offset: +; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + 
ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: +; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: +; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: +; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32: +; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret: +; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_addr64: +; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: +; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: +; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] 
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: +; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32: +; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: +; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: +; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_offset: +; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %gep = getelementptr i32, 
i32 addrspace(1)* %out, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: +; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} +define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32: +; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +entry: + %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: +; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + ret void +} + +; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: +; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} +; SI: buffer_store_dword [[RET]] +define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index + %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst + store i32 %0, i32 addrspace(1)* %out2 + ret void +} diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll new file mode 100644 index 00000000000..014b0a5482a --- /dev/null +++ b/test/CodeGen/AMDGPU/gv-const-addrspace-fail.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1 + +; FUNC-LABEL: {{^}}test_i8: +; EG: CF_END +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 { + %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s + %1 = load i8, i8 addrspace(2)* %arrayidx, align 1 + store i8 
%1, i8 addrspace(1)* %out + ret void +} + +@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 + +; FUNC-LABEL: {{^}}test_i16: +; EG: CF_END +; SI: buffer_store_short +; SI: s_endpgm +define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 { + %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s + %1 = load i16, i16 addrspace(2)* %arrayidx, align 2 + store i16 %1, i16 addrspace(1)* %out + ret void +} + +%struct.bar = type { float, [5 x i8] } + +; The illegal i8s aren't handled +@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ] + +; FUNC-LABEL: {{^}}struct_bar_gv_load: +define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i8, i8 addrspace(2)* %gep, align 1 + store i8 %load, i8 addrspace(1)* %out, align 1 + ret void +} + + +; The private load isn't scalarized. +@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> , + <4 x i32> , + <4 x i32> , + <4 x i32> ] + +; FUNC-LABEL: {{^}}array_vector_gv_load: +define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index + %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16 + store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll new file mode 100644 index 00000000000..3c1fc6c98f7 --- /dev/null +++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -0,0 +1,101 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 + +@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 + +; FUNC-LABEL: {{^}}float: +; FIXME: We should be using s_load_dword here. +; SI: buffer_load_dword +; VI: s_load_dword + +; EG-DAG: MOV {{\** *}}T2.X +; EG-DAG: MOV {{\** *}}T3.X +; EG-DAG: MOV {{\** *}}T4.X +; EG-DAG: MOV {{\** *}}T5.X +; EG-DAG: MOV {{\** *}}T6.X +; EG: MOVA_INT + +define void @float(float addrspace(1)* %out, i32 %index) { +entry: + %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %1 = load float, float addrspace(2)* %0 + store float %1, float addrspace(1)* %out + ret void +} + +@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 + +; FUNC-LABEL: {{^}}i32: + +; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword +; VI: s_load_dword + +; EG-DAG: MOV {{\** *}}T2.X +; EG-DAG: MOV {{\** *}}T3.X +; EG-DAG: MOV {{\** *}}T4.X +; EG-DAG: MOV {{\** *}}T5.X +; EG-DAG: MOV {{\** *}}T6.X +; EG: MOVA_INT + +define void @i32(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + + +%struct.foo = type { float, [5 x i32] } + +@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] + +; FUNC-LABEL: {{^}}struct_foo_gv_load: +; GCN: s_load_dword + +define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index + %load = load i32, i32 addrspace(2)* %gep, align 4 + store i32 %load, i32 addrspace(1)* %out, align 4 + ret void +} + +@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> , + <1 x i32> , + <1 x i32> , + <1 x i32> ] + +; FUNC-LABEL: {{^}}array_v1_gv_load: +; FIXME: We should be using s_load_dword here. +; SI: buffer_load_dword +; VI: s_load_dword +define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { + %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index + %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 + store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 + ret void +} + +define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +entry: + %0 = icmp eq i32 0, %a + br i1 %0, label %if, label %else + +if: + %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index + %2 = load float, float addrspace(2)* %1 + store float %2, float addrspace(1)* %out + br label %endif + +else: + store float 1.0, float addrspace(1)* %out + br label %endif + +endif: + ret void +} diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll new file mode 100644 index 00000000000..bf8f11860b5 --- /dev/null +++ b/test/CodeGen/AMDGPU/half.ll @@ -0,0 +1,525 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; half args should be promoted to float + +; GCN-LABEL: {{^}}load_f16_arg: +; GCN: s_load_dword [[ARG:s[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] +; GCN: buffer_store_short [[CVT]] +define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { + store half %arg, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v2f16_arg: +; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 +; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: s_endpgm +define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { + store <2 x half> %arg, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v3f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_short +; 
GCN-NOT: buffer_store +; GCN: s_endpgm +define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { + store <3 x half> %arg, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v4f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { + store <4 x half> %arg, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v8f16_arg: +define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { + store <8 x half> %arg, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_arg: +define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { + %fpext = fpext <2 x half> %in to <2 x float> + store <2 x float> %fpext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f32_arg: +define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to float + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: +define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x float> + store <2 x float> %ext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16 +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_dwordx2 +; GCN: s_endpgm +define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x float> + store <3 x float> %ext, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: +define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x float> + store <4 x float> %ext, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x float> + store <8 x float> %ext, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f64_arg: +define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to double + store double %ext, double addrspace(1)* %out + ret void +} +; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: +define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x double> + store <2 x double> %ext, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: +define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x double> + store <3 x double> %ext, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: +define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x double> + store <4 x 
double> %ext, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: +define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x double> + store <8 x double> %ext, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_f16: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + store half %val, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v2f16: +; GCN: buffer_load_dword [[TMP:v[0-9]+]] +; GCN: buffer_store_dword [[TMP]] +define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + store <2 x half> %val, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v4f16: +; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[TMP]] +define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + store <4 x half> %val, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v8f16: +; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: s_endpgm +define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + store <8 x half> %val, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_f16_to_f32: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_dword [[CVT]] +define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to float + store float %cvt, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: +define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: +define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: +define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: +define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}global_extload_v16f16_to_v16f32: +define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x float> + store <16 x float> %cvt, <16 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_f16_to_f64: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] +; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] +; GCN: buffer_store_dwordx2 [[CVT1]] +define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to double + store double %cvt, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: +define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x double> + store <2 x double> %cvt, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: +define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x double> + store <3 x double> %cvt, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: +define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x double> + store <4 x double> %cvt, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: +define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x double> + store <8 x double> %cvt, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: +define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, <16 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_short [[CVT]] +define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { + %val = load float, float addrspace(1)* %in + %cvt = fptrunc float %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: +; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] +; GCN-DAG: buffer_store_short [[CVT0]] +; GCN-DAG: buffer_store_short [[CVT1]] +; GCN: s_endpgm +define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { + %val = load <2 x float>, <2 x float> addrspace(1)* %in + %cvt = fptrunc <2 x float> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void 
+} + +; FIXME: Shouldn't do 4th conversion +; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_dword +; GCN: s_endpgm +define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %val = load <3 x float>, <3 x float> addrspace(1)* %in + %cvt = fptrunc <3 x float> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { + %val = load <4 x float>, <4 x float> addrspace(1)* %in + %cvt = fptrunc <4 x float> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { + %val = load <8 x float>, <8 x float> addrspace(1)* %in + %cvt = fptrunc <8 x float> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void 
@global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { + %val = load <16 x float>, <16 x float> addrspace(1)* %in + %cvt = fptrunc <16 x float> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} + +; FIXME: Unsafe math should fold conversions away +; GCN-LABEL: {{^}}fadd_f16: +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { + %add = fadd half %a, %b + store half %add, half addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2f16: +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { + %add = fadd <2 x half> %a, %b + store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v4f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 + %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 + %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x half> %a, %b + store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 + ret void +} + +; GCN-LABEL: {{^}}fadd_v8f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { + %add = fadd <8 x half> %a, %b + store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 + ret void +} + +; GCN-LABEL: {{^}}fsub_f16: +; GCN: v_subrev_f32_e32 +; GCN: s_endpgm +define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 + %a = load half, half addrspace(1)* %in + %b = load half, half addrspace(1)* %b_ptr + %sub = fsub half %a, %b + store half %sub, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_from_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { + %val = load half, half addrspace(1)* %in + %val_int = bitcast half %val to i16 + store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_to_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { + %val = load i16, i16 addrspace(1)* %in + %val_fp = bitcast i16 %val to half + store half %val_fp, half addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/hsa.ll b/test/CodeGen/AMDGPU/hsa.ll new file mode 100644 index 00000000000..f9113399afe --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s + +; HSA: .section .hsa.version +; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0" +; HSA: {{^}}simple: +; Make sure we are setting the ATC bit: +; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000 +; HSA: buffer_store_dword v{{[0-9]+}}, 
s[0:[[HI]]], 0 + +define void @simple(i32 addrspace(1)* %out) { +entry: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll new file mode 100644 index 00000000000..b11a2113764 --- /dev/null +++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SILowerI1Copies was not handling IMPLICIT_DEF +; SI-LABEL: {{^}}br_implicit_def: +; SI: BB#0: +; SI-NEXT: s_and_saveexec_b64 +; SI-NEXT: s_xor_b64 +; SI-NEXT: BB#1: +define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { +bb: + br i1 undef, label %bb1, label %bb2 + +bb1: + store volatile i32 123, i32 addrspace(1)* %out + ret void + +bb2: + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll new file mode 100644 index 00000000000..105cd06b330 --- /dev/null +++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}br_i1_phi: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; SI: s_and_saveexec_b64 +; SI: s_xor_b64 +; SI: v_mov_b32_e32 [[REG]], -1{{$}} +; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]] +; SI: s_and_saveexec_b64 +; SI: s_xor_b64 +; SI: s_endpgm +define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { +bb: + br i1 %arg1, label %bb2, label %bb3 + +bb2: ; preds = %bb + br label %bb3 + +bb3: ; preds = %bb2, %bb + %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] + br i1 %tmp, label %bb4, label %bb6 + +bb4: ; preds = %bb3 + %tmp5 = mul i32 undef, %arg + br label %bb6 + +bb6: ; preds = %bb4, %bb3 + ret void +} diff --git a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll new file mode 100644 index 00000000000..c218e1918bb --- /dev/null +++ b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll @@ -0,0 +1,11 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8, i8 addrspace(1)* %in + %2 = uitofp i8 %1 to double + %3 = fptrunc double %2 to float + store float %3, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll new file mode 100644 index 00000000000..60e59a5a528 --- /dev/null +++ b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Test that a select with reversed True/False values is correctly lowered +;to a SETNE_INT. There should only be one SETNE_INT instruction. 
+ +;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK-NOT: SETNE_INT + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx1 + %cmp = icmp eq i32 %0, %1 + %value = select i1 %cmp, i32 0, i32 -1 + store i32 %value, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll new file mode 100644 index 00000000000..0eaa33ebafe --- /dev/null +++ b/test/CodeGen/AMDGPU/icmp64.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}test_i64_eq: +; SI: v_cmp_eq_i64 +define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp eq i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ne: +; SI: v_cmp_ne_i64 +define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ne i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_slt: +; SI: v_cmp_lt_i64 +define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp slt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ult: +; SI: v_cmp_lt_u64 +define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ult i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sle: +; SI: v_cmp_le_i64 +define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sle i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ule: +; SI: v_cmp_le_u64 +define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ule i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sgt: +; SI: v_cmp_gt_i64 +define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sgt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_ugt: +; SI: v_cmp_gt_u64 +define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp ugt i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_sge: +; SI: v_cmp_ge_i64 +define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp sge i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_i64_uge: +; SI: v_cmp_ge_u64 +define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %cmp = icmp uge i64 %a, %b + %result = sext i1 %cmp to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll new file mode 100644 index 
00000000000..12eed550eb1 --- /dev/null +++ b/test/CodeGen/AMDGPU/imm.ll @@ -0,0 +1,617 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s + +; Use a 64-bit value with lo bits that can be represented as an inline constant +; CHECK-LABEL: {{^}}i64_imm_inline_lo: +; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 +; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: +define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { +entry: + store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 + ret void +} + +; Use a 64-bit value with hi bits that can be represented as an inline constant +; CHECK-LABEL: {{^}}i64_imm_inline_hi: +; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 +; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] +; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] +define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { +entry: + store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 + ret void +} + +; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { + store i64 -9223372036854775808, i64 addrspace(1) *%out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { + store i32 -2147483648, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { + store float 0.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 +; CHECK: buffer_store_dword [[REG]] +define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { + store float -0.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { + store float 0.5, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { + store float -0.5, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { + store float 1.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { + store float -1.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32: +; CHECK: 
v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { + store float 2.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { + store float -2.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { + store float 4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { + store float -4.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_literal_imm_f32: +; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 +; CHECK: buffer_store_dword [[REG]] +define void @store_literal_imm_f32(float addrspace(1)* %out) { + store float 4096.0, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 1.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -1.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 2.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_2.0_f32(float 
addrspace(1)* %out, float %x) { + %y = fadd float %x, -2.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 4.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, -4.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32: +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %x = load float, float addrspace(1)* %in + %y = fadd float %x, 0.5 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}commute_add_literal_f32: +; CHECK: buffer_load_dword [[VAL:v[0-9]+]] +; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { + %x = load float, float addrspace(1)* %in + %y = fadd float %x, 1024.0 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36a0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36b0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_16_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36e0000000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xffffffffe0000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xffffffffc0000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]] +; CHECK: buffer_store_dword 
[[REG]] +define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0xfffffffe00000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_63_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x36ff800000000000 + store float %y, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_64_f32: +; CHECK: s_load_dword [[VAL:s[0-9]+]] +; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]] +; CHECK: buffer_store_dword [[REG]] +define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { + %y = fadd float %x, 0x3700000000000000 + store float %y, float addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0.5 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -0.5 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 1.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -1.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void 
@add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 2.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -2.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 4.0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, -4.0 + store double %y, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}add_inline_imm_1_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000001 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_2_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000002 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_16_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000010 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xffffffffffffffff + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64: +; SI: 
s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xfffffffffffffffe + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0xfffffffffffffff0 + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_63_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x000000000000003F + store double %y, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}add_inline_imm_64_f64: +; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c +; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]] +; CHECK: buffer_store_dwordx2 [[REG]] +define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { + %y = fadd double %x, 0x0000000000000040 + store double %y, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64: +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { + store double 0.0, double addrspace(1)* %out + ret void +} + + +; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { + store double -0.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { + store double 0.5, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { + store double -0.5, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: 
v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { + store double 1.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { + store double -1.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { + store double 2.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { + store double -2.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { + store double 4.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64: +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { + store double -4.0, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}store_literal_imm_f64: +; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000 +; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] +; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} +define void @store_literal_imm_f64(double addrspace(1)* %out) { + store double 4096.0, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll new file mode 100644 index 00000000000..f551606d63a --- /dev/null +++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -0,0 +1,121 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; Tests for indirect addressing on SI, which is implemented using dynamic +; indexing of vectors. 
+
+; CHECK-LABEL: {{^}}extract_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
+define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0
+define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+entry:
+  %index = add i32 %offset, -512
+  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}extract_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0
+; CHECK: s_cbranch_execnz
+define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -512
+  %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_w_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 %in, 1
+  %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
+  %2 = extractelement <4 x float> %1, i32 2
+  store float %2, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_wo_offset:
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
+define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
+  %1 = extractelement <4 x float> %0, i32 2
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_sgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
+; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}}
+define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+entry:
+  %index = add i32 %offset, -512
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -512
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr:
+; The offset depends on the register that holds the first element of the vector.
+; CHECK: v_readfirstlane_b32
+; CHECK: s_add_i32 m0, m0, -{{[0-9]+}}
+; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}}
+; CHECK: s_cbranch_execnz
+define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+entry:
+  %id = call i32 @llvm.r600.read.tidig.x() #1
+  %index = add i32 %id, -16
+  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
+  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
new file mode 100644
index 00000000000..d63e1b6c521
--- /dev/null
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
+
+declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+
+; SI-LABEL: {{^}}private_access_f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load double, double addrspace(1)* %in, align 8
+  %array = alloca double, i32 16, align 8
+  %ptr = getelementptr double, double* %array, i32 %b
+  store double %val, double* %ptr, align 8
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  %result = load double, double* %ptr, align 8
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}private_access_v2f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
+  %array = alloca <2 x double>, i32 16, align 16
+  %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
+  store <2 x double> %val, <2 x double>* %ptr, align 16
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  %result = load <2 x double>, <2 x double>* %ptr, align 16
+  store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}private_access_i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
+define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
+  %val = load i64, i64 addrspace(1)* %in, align 8
+  %array = alloca i64, i32 16, align 8
+  %ptr = getelementptr i64, i64* %array, i32 %b
+  store i64 %val, i64* %ptr, align 8
+  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+ 
%result = load i64, i64* %ptr, align 8 + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}private_access_v2i64_alloca: + +; SI-ALLOCA: buffer_store_dwordx4 +; SI-ALLOCA: buffer_load_dwordx4 + +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 +define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { + %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 + %array = alloca <2 x i64>, i32 16, align 16 + %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b + store <2 x i64> %val, <2 x i64>* %ptr, align 16 + call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + %result = load <2 x i64>, <2 x i64>* %ptr, align 16 + store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll new file mode 100644 index 00000000000..f6e39b3d830 --- /dev/null +++ b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll @@ -0,0 +1,10 @@ +; XFAIL: * +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s + +define void @inf_loop_irreducible_cfg() nounwind { +entry: + br label %block + +block: + br label %block +} diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll new file mode 100644 index 00000000000..7233aa57fd7 --- /dev/null +++ b/test/CodeGen/AMDGPU/infinite-loop.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}infinite_loop: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 +; SI: BB0_1: +; SI: buffer_store_dword [[REG]] +; SI: s_waitcnt vmcnt(0) expcnt(0) +; SI: s_branch BB0_1 +define void @infinite_loop(i32 addrspace(1)* %out) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + store i32 999, i32 addrspace(1)* %out, align 4 + br label %for.body +} + diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll new file mode 100644 index 00000000000..efc2292de3a --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-asm.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}inline_asm: +; CHECK: s_endpgm +; CHECK: s_endpgm +define void @inline_asm(i32 addrspace(1)* %out) { +entry: + store i32 5, i32 addrspace(1)* %out + call void asm sideeffect "s_endpgm", ""() + ret void +} diff --git a/test/CodeGen/AMDGPU/inline-calls.ll b/test/CodeGen/AMDGPU/inline-calls.ll new file mode 100644 index 00000000000..33a4c832e75 --- /dev/null +++ b/test/CodeGen/AMDGPU/inline-calls.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s + +; CHECK-NOT: {{^}}func: +define internal fastcc i32 @func(i32 %a) { +entry: + %tmp0 = add i32 %a, 1 + ret i32 %tmp0 +} + +; CHECK: {{^}}kernel: +define void @kernel(i32 addrspace(1)* %out) { +entry: + %tmp0 = call i32 @func(i32 1) + store i32 %tmp0, i32 addrspace(1)* %out + ret void +} + +; CHECK: 
{{^}}kernel2: +define void @kernel2(i32 addrspace(1)* %out) { +entry: + call void @kernel(i32 addrspace(1)* %out) + ret void +} diff --git a/test/CodeGen/AMDGPU/input-mods.ll b/test/CodeGen/AMDGPU/input-mods.ll new file mode 100644 index 00000000000..1c4d285cbcb --- /dev/null +++ b/test/CodeGen/AMDGPU/input-mods.ll @@ -0,0 +1,26 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM + +;EG-LABEL: {{^}}test: +;EG: EXP_IEEE * +;CM-LABEL: {{^}}test: +;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| +;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| +;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| +;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = call float @llvm.fabs.f32(float %r0) + %r2 = fsub float -0.000000e+00, %r1 + %r3 = call float @llvm.exp2.f32(float %r2) + %vec = insertelement <4 x float> undef, float %r3, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare float @llvm.fabs.f32(float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/insert_subreg.ll b/test/CodeGen/AMDGPU/insert_subreg.ll new file mode 100644 index 00000000000..4a5e8869c2d --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_subreg.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s + +; Test that INSERT_SUBREG instructions don't have non-register operands after +; instruction selection. + +; Make sure this doesn't crash +; CHECK-LABEL: test: +define void @test(i64 addrspace(1)* %out) { +entry: + %tmp0 = alloca [16 x i32] + %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 + %tmp2 = sext i32 %tmp1 to i64 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll new file mode 100644 index 00000000000..6de3d408c48 --- /dev/null +++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -0,0 +1,252 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +; FIXME: Broken on evergreen +; FIXME: For some reason the 8 and 16 vectors are being stored as +; individual elements instead of 128-bit stores. + + +; FIXME: Why is the constant moved into the intermediate register and +; not just directly into the vector component? 
+ +; SI-LABEL: {{^}}insertelement_v4f32_0: +; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]: +; v_mov_b32_e32 +; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00 +; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] +; buffer_store_dwordx4 v{{[}}[[LOW_REG]]: +define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_1: +define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_2: +define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4f32_3: +define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}insertelement_v4i32_0: +define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { + %vecins = insertelement <4 x i32> %a, i32 999, i32 0 + store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2f32: +; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: +define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { + %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b + store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4f32: +; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 +; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] +; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: +define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { + %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b + store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8f32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { + %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b + store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16f32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { + %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b + store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2i32: +; SI: buffer_store_dwordx2 +define void @dynamic_insertelement_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i32> %a, i32 5, i32 %b + store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i32: +; SI: buffer_store_dwordx4 +define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i32> %a, i32 5, i32 %b + store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8i32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <8 x i32> %a, i32 5, i32 %b + store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16i32: +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { + %vecins = insertelement <16 x i32> %a, i32 5, i32 %b + store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 + ret void +} + + +; SI-LABEL: {{^}}dynamic_insertelement_v2i16: +; FIXMESI: buffer_store_dwordx2 +define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i16> %a, i16 5, i32 %b + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i16: +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i16> %a, i16 5, i32 %b + store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16 + ret void +} + + +; SI-LABEL: {{^}}dynamic_insertelement_v2i8: +; FIXMESI: BUFFER_STORE_USHORT +define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i8> %a, i8 5, i32 %b + store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4i8: +; FIXMESI: buffer_store_dword +define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <4 x i8> %a, i8 5, i32 %b + store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8i8: +; FIXMESI: buffer_store_dwordx2 +define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <8 x i8> %a, i8 5, i32 %b + store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v16i8: +; FIXMESI: buffer_store_dwordx4 +define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { + %vecins = insertelement <16 x i8> %a, i8 5, i32 %b + store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 + ret void +} + +; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that +; the compiler doesn't crash. 
+; SI-LABEL: {{^}}insert_split_bb: +define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +entry: + %0 = insertelement <2 x i32> undef, i32 %a, i32 0 + %1 = icmp eq i32 %a, 0 + br i1 %1, label %if, label %else + +if: + %2 = load i32, i32 addrspace(1)* %in + %3 = insertelement <2 x i32> %0, i32 %2, i32 1 + br label %endif + +else: + %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %5 = load i32, i32 addrspace(1)* %4 + %6 = insertelement <2 x i32> %0, i32 %5, i32 1 + br label %endif + +endif: + %7 = phi <2 x i32> [%3, %if], [%6, %else] + store <2 x i32> %7, <2 x i32> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { + %vecins = insertelement <2 x double> %a, double 8.0, i32 %b + store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v2i64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { + %vecins = insertelement <2 x i64> %a, i64 5, i32 %b + store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v4f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { + %vecins = insertelement <4 x double> %a, double 8.0, i32 %b + store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}dynamic_insertelement_v8f64: +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { + %vecins = insertelement <8 x double> %a, double 8.0, i32 %b + store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/jump-address.ll b/test/CodeGen/AMDGPU/jump-address.ll new file mode 100644 index 00000000000..f55912e3740 --- /dev/null +++ b/test/CodeGen/AMDGPU/jump-address.ll @@ -0,0 +1,52 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: JUMP @6 +; CHECK: EXPORT +; CHECK-NOT: EXPORT + +define void @main() #0 { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + %4 = sext i1 %3 to i32 + %5 = bitcast i32 %4 to float + %6 = bitcast float %5 to i32 + %7 = icmp ne i32 %6, 0 + br i1 %7, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %9 = extractelement <4 x float> %8, i32 0 + %10 = bitcast float %9 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float 
%13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF13, label %ENDIF + +ENDIF: ; preds = %IF13, %ELSE, %main_body + %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %16 = insertelement <4 x float> undef, float %temp.0, i32 0 + %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp2.0, i32 2 + %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) + ret void + +IF13: ; preds = %ELSE + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fsub float -0.000000e+00, %21 + %23 = fadd float 0xFFF8000000000000, %22 + br label %ENDIF +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll new file mode 100644 index 00000000000..7e2291cfdc3 --- /dev/null +++ b/test/CodeGen/AMDGPU/kcache-fold.ll @@ -0,0 +1,100 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main1: +; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} +define void @main1() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fcmp ogt float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %11 = extractelement <4 x float> %10, i32 1 + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ogt float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float>, <4 x float> addrspace(8)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 2 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ogt float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float>, <4 x float> addrspace(8)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %27 = extractelement <4 x float> %26, i32 3 + %28 = load <4 x 
float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fcmp ogt float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +; CHECK: {{^}}main2: +; CHECK-NOT: MOV +define void @main2() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %5 = extractelement <4 x float> %4, i32 1 + %6 = fcmp ogt float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %11 = extractelement <4 x float> %10, i32 0 + %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ogt float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float>, <4 x float> addrspace(8)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 3 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ogt float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float>, <4 x float> addrspace(8)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %27 = extractelement <4 x float> %26, i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 2 + %30 = fcmp ogt float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call float 
@llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDIL.clamp.(float, float, float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll new file mode 100644 index 00000000000..1dd7c2cb799 --- /dev/null +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -0,0 +1,473 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC + +; FUNC-LABEL: {{^}}i8_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; GCN: buffer_load_ubyte + +define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i8_zext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i8_sext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { +entry: + %0 = sext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; GCN: buffer_load_ushort + +define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_zext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i16_sext_arg: +; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c + +define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { +entry: + %0 = sext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i32_arg: +; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z +; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c +define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { +entry: + store i32 %in, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}f32_arg: +; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z 
+; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb +; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c +define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { +entry: + store float %in, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v2i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { +entry: + store <2 x i8> %in, <2 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN-DAG: buffer_load_ushort +; GCN-DAG: buffer_load_ushort +define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { +entry: + store <2 x i16> %in, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v2i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb +; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c +define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { +entry: + store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v2f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb +; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c +define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { +entry: + store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3i8_arg: +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 +; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 +define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +entry: + store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3i16_arg: +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 +; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 +define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { +entry: + store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}v3i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 +define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { +entry: + store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v3f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 +define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { +entry: + store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v4i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { +entry: + store <4 x i8> %in, <4 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i16_arg: +; EG: VTX_READ_16 
+; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { +entry: + store <4 x i16> %in, <4 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v4i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 +define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v4f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd +; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 +define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { +entry: + store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v8i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { +entry: + store <8 x i8> %in, <8 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v8i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { +entry: + store <8 x i16> %in, <8 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v8i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 +define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { +entry: + store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v8f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 +define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { +entry: + store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 + ret void +} 
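A side note on the check lines in this file: the SI and VI patterns differ only in the s_load immediate offset, because SMRD offsets are encoded in dwords on Southern Islands while Volcanic Islands encodes them in bytes. A minimal Python sketch (illustrative only, not part of the original patch) of that conversion for the offset pairs used in this test:

# SI s_load_dword* immediate offsets are in dwords; VI encodes the same
# kernel-argument offsets in bytes, i.e. the SI value multiplied by 4.
def vi_byte_offset(si_dword_offset):
    return si_dword_offset * 4

# Offset pairs that appear in the SI/VI check lines of this test.
for si, vi in [(0x9, 0x24), (0xb, 0x2c), (0xd, 0x34), (0x11, 0x44), (0x19, 0x64)]:
    assert vi_byte_offset(si) == vi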
+ +; FUNC-LABEL: {{^}}v16i8_arg: +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; EG: VTX_READ_8 +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { +entry: + store <16 x i8> %in, <16 x i8> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v16i16_arg: +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; EG: VTX_READ_16 +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { +entry: + store <16 x i16> %in, <16 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v16i32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 +define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { +entry: + store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v16f32_arg: +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 +; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, 
s[0:1], 0x64 +define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { +entry: + store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}kernel_arg_i64: +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: buffer_store_dwordx2 +define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { + store i64 %a, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}f64_kernel_arg: +; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 +; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb +; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 +; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c +; GCN: buffer_store_dwordx2 +define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { +entry: + store double %in, double addrspace(1)* %out + ret void +} + +; XFUNC-LABEL: {{^}}kernel_arg_v1i64: +; XGCN: s_load_dwordx2 +; XGCN: s_load_dwordx2 +; XGCN: buffer_store_dwordx2 +; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 +; ret void +; } diff --git a/test/CodeGen/AMDGPU/large-alloca.ll b/test/CodeGen/AMDGPU/large-alloca.ll new file mode 100644 index 00000000000..671833d1a33 --- /dev/null +++ b/test/CodeGen/AMDGPU/large-alloca.ll @@ -0,0 +1,15 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s + +define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 + store i32 %x, i32* %gep + %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y + %0 = load i32, i32* %gep1 + store i32 %0, i32 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/large-constant-initializer.ll b/test/CodeGen/AMDGPU/large-constant-initializer.ll new file mode 100644 index 00000000000..9975b1b7f5c --- /dev/null +++ b/test/CodeGen/AMDGPU/large-constant-initializer.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s +; CHECK: s_endpgm + +@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 + +define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 + %mul12 = mul nsw i32 %val, 7 + br i1 undef, label %exit, label %bb + +bb: + %cmp = icmp slt i32 %x, 0 + br label %exit + +exit: + ret void +} + diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll new file mode 100644 index 00000000000..bf8df63be9f --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_init_lds_global + +@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] + +define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(3)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/test/CodeGen/AMDGPU/lds-oqap-crash.ll new file mode 100644 index 
00000000000..6ff6fc3d7af --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-oqap-crash.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s + +; The test is for a bug in R600EmitClauseMarkers.cpp where this pass +; was searching for a use of the OQAP register in order to determine +; if an LDS instruction could fit in the current clause, but never finding +; one. This created an infinite loop and hung the compiler. +; +; The LDS instruction should not have been defining OQAP in the first place, +; because the LDS instructions are pseudo instructions and the OQAP +; reads and writes are bundled together in the same instruction. + +; CHECK: {{^}}lds_crash: +define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { +entry: + %0 = load i32, i32 addrspace(3)* %in + ; This block needs to be > 115 ISA instructions to hit the bug, + ; so we'll use udiv instructions. + %div0 = udiv i32 %0, %b + %div1 = udiv i32 %div0, %a + %div2 = udiv i32 %div1, 11 + %div3 = udiv i32 %div2, %a + %div4 = udiv i32 %div3, %b + %div5 = udiv i32 %div4, %c + %div6 = udiv i32 %div5, %div0 + %div7 = udiv i32 %div6, %div1 + store i32 %div7, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll new file mode 100644 index 00000000000..44ffc36af14 --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-output-queue.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s +; +; This test checks that the lds input queue is empty at the end of +; the ALU clause. + +; CHECK-LABEL: {{^}}lds_input_queue: +; CHECK: LDS_READ_RET * OQAP +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP + +@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 + +define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +entry: + %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index + %1 = load i32, i32 addrspace(3)* %0 + call void @llvm.AMDGPU.barrier.local() + + ; This will start a new clause for the vertex fetch + %2 = load i32, i32 addrspace(1)* %in + %3 = add i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +; The machine scheduler does not do proper alias analysis and assumes that +; loads from global values (Note that a global value is different from a +; value in global memory. A global value is a value that is declared +; outside of a function; it can reside in any address space) alias with +; all other loads. +; +; This is a problem for scheduling the reads from the local data share (lds). +; These reads are implemented using two instructions. The first copies the +; data from lds into the lds output queue, and the second moves the data from +; this queue into main memory. These two instructions don't have to be +; scheduled one after the other, but they do need to be scheduled in the same +; clause.
The aliasing problem mentioned above causes problems when there is a +; load from global memory which immediately follows a load from a global value that +; has been declared in the local memory space: +; +; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index +; %1 = load i32, i32 addrspace(3)* %0 +; %2 = load i32, i32 addrspace(1)* %in +; +; The instruction selection phase will generate ISA that looks like this: +; %OQAP = LDS_READ_RET +; %vreg0 = MOV %OQAP +; %vreg1 = VTX_READ_32 +; %vreg2 = ADD_INT %vreg1, %vreg0 +; +; The bottom scheduler will schedule the two ALU instructions first: +; +; UNSCHEDULED: +; %OQAP = LDS_READ_RET +; %vreg1 = VTX_READ_32 +; +; SCHEDULED: +; +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg2 +; +; The lack of proper aliasing means the local memory read (LDS_READ_RET) +; and the global memory read (VTX_READ_32) are linked by a chain dependency, so +; the global memory read will always be scheduled first. This will give us a +; final program which looks like this: +; +; Alu clause: +; %OQAP = LDS_READ_RET +; VTX clause: +; %vreg1 = VTX_READ_32 +; Alu clause: +; vreg0 = MOV %OQAP +; vreg2 = ADD_INT %vreg1, %vreg2 +; +; This is an illegal program because the OQAP def and use now occur in +; different ALU clauses. +; +; This test checks this scenario and makes sure it doesn't result in an +; illegal program. For now, we have fixed this issue by merging the +; LDS_READ_RET and MOV together during instruction selection and then +; expanding them after scheduling. Once the scheduler has better alias +; analysis, we should be able to keep these instructions separate before +; scheduling. +; +; CHECK-LABEL: {{^}}local_global_alias: +; CHECK: LDS_READ_RET +; CHECK-NOT: ALU clause +; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP +define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 + %1 = load i32, i32 addrspace(3)* %0 + %2 = load i32, i32 addrspace(1)* %in + %3 = add i32 %2, %1 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll new file mode 100644 index 00000000000..3e8328659fd --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-size.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test makes sure we do not double count global values when they are +; used in different basic blocks.
+ +; CHECK: .long 166120 +; CHECK-NEXT: .long 1 +; CHECK-LABEL: {{^}}test: +@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 + +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %else + +if: + store i32 1, i32 addrspace(3)* @lds + br label %endif + +else: + store i32 2, i32 addrspace(3)* @lds + br label %endif + +endif: + ret void +} diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll new file mode 100644 index 00000000000..fb51bc0e50c --- /dev/null +++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -0,0 +1,13 @@ +; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s + +; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global + +@lds = addrspace(3) global [256 x i32] zeroinitializer + +define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { + %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 + %ld = load i32, i32 addrspace(3)* %gep + store i32 %ld, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll new file mode 100644 index 00000000000..4244c48d240 --- /dev/null +++ b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This tests a bug where LegalizeDAG was not checking the target's +; BooleanContents value and always using one for true, when expanding +; setcc to select_cc. +; +; This bug caused the icmp IR instruction to be expanded to two machine +; instructions, when only one is needed. +; + +; CHECK: {{^}}setcc_expand: +; CHECK: SET +; CHECK-NOT: CND +define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp eq i32 %in, 5 + br i1 %0, label %IF, label %ENDIF +IF: + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %1 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/lit.local.cfg b/test/CodeGen/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..2a665f06be7 --- /dev/null +++ b/test/CodeGen/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll new file mode 100644 index 00000000000..cff1c24f89d --- /dev/null +++ b/test/CodeGen/AMDGPU/literals.ll @@ -0,0 +1,64 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Test using an integer literal constant. +; Generated ASM should be: +; ADD_INT KC0[2].Z literal.x, 5 +; or +; ADD_INT literal.x KC0[2].Z, 5 + +; CHECK: {{^}}i32_literal: +; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5 +define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = add i32 5, %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Test using a float literal constant. 
+; Generated ASM should be: +; ADD KC0[2].Z literal.x, 5.0 +; or +; ADD literal.x KC0[2].Z, 5.0 + +; CHECK: {{^}}float_literal: +; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.0 +define void @float_literal(float addrspace(1)* %out, float %in) { +entry: + %0 = fadd float 5.0, %in + store float %0, float addrspace(1)* %out + ret void +} + +; Make sure inline literals are folded into REG_SEQUENCE instructions. +; CHECK: {{^}}inline_literal_reg_sequence: +; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 +; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 + +define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { +entry: + store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: {{^}}inline_literal_dot4: +; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0 +; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 +; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 +; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 +define void @inline_literal_dot4(float addrspace(1)* %out) { +entry: + %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll new file mode 100644 index 00000000000..8bf094b8bc7 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.abs.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone + +; Legacy name +declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_abs_i32: +; SI: s_sub_i32 +; SI: s_max_i32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { + %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_abs_i32: +; SI: v_sub_i32_e32 +; SI: v_max_i32_e32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}abs_i32_legacy_amdil: +; SI: v_sub_i32_e32 +; SI: v_max_i32_e32 +; SI: s_endpgm + +; EG: SUB_INT +; EG: MAX_INT +define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { + %val = load i32, i32 addrspace(1)* %src, align 4 + %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone + store i32 %abs, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll new file mode 100644 index 00000000000..db883972d64 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG
-check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_barrier_global: +; EG: GROUP_BARRIER +; SI: buffer_store_dword +; SI: s_waitcnt +; SI: s_barrier + +define void @test_barrier_global(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.global() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 + %6 = load i32, i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.global() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll new file mode 100644 index 00000000000..48fb2e0b1a8 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_barrier_local: +; EG: GROUP_BARRIER + +; SI: buffer_store_dword +; SI: s_waitcnt +; SI: s_barrier + +define void @test_barrier_local(i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() + %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 + store i32 %0, i32 addrspace(1)* %1 + call void @llvm.AMDGPU.barrier.local() + %2 = call i32 @llvm.r600.read.local.size.x() + %3 = sub i32 %2, 1 + %4 = sub i32 %3, %0 + %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 + %6 = load i32, i32 addrspace(1)* %5 + store i32 %6, i32 addrspace(1)* %1 + ret void +} + +declare void @llvm.AMDGPU.barrier.local() + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.local.size.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll new file mode 100644 index 00000000000..1168713ca66 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -0,0 +1,437 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac +define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* 
%out, i32 %src0, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: +; SI: v_bfe_i32 +; EG: BFE_INT +define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_bfe_print_arg: +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 +define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { + %load = load i32, i32 addrspace(1)* %src0, align 4 + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_6: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: s_endpgm +define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_7: +; SI-NOT: shl +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_8: +; SI: buffer_load_dword +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void 
@bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_13: +; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_test_14: +; SI-NOT: lshr +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}bfe_i32_constant_fold_test_4: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_12(i32 
addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: +; SI: buffer_load_dword [[LOAD:v[0-9]+]], +; SI-NOT: v_lshl +; SI-NOT: v_ashr +; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 +; SI: buffer_store_dword [[BFE]], +define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) + %shl = shl i32 %bfe, 8 + %ashr = ashr i32 %shl, 8 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @simplify_demanded_bfe_sdiv +; SI: buffer_load_dword [[LOAD:v[0-9]+]] +; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 +; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] +; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] +; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] +; SI: buffer_store_dword 
[[TMP2]] +define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %src = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone + %div = sdiv i32 %bfe, 2 + store i32 %div, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll new file mode 100644 index 00000000000..541119242a9 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -0,0 +1,627 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: +; SI: v_bfe_u32 +; EG: BFE_UINT +define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: +; SI: buffer_load_ubyte +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8, i8 addrspace(1)* %in + %ext = zext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 +; SI-NOT: 
{{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 255 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: +; SI: buffer_load_dword +; SI: v_add_i32 +; SI-NEXT: bfe +; SI: s_endpgm +define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %load = load i32, i32 addrspace(1)* %in, align 4 + %add = add i32 %load, 1 + %ext = and i32 %add, 65535 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_1: +; SI: buffer_load_dword +; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; SI: s_endpgm +; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, +define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +define void @bfe_u32_test_3(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_4: +; SI-NOT: lshl +; SI-NOT: shr +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = lshr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_5: +; SI: buffer_load_dword +; SI-NOT: lshl +; SI-NOT: shr +; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_6: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: s_endpgm +define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_7: +; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_11(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_13: +; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = ashr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_test_14: +; SI-NOT: lshr +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = lshr i32 %x, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}bfe_u32_constant_fold_test_5: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* 
%out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: +; SI-NOT: {{[^@]}}bfe +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 +; SI: buffer_store_dword [[VREG]], +; SI: s_endpgm +; EG-NOT: BFE +define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure that SimplifyDemandedBits doesn't cause the and to be +; reduced to the bits demanded by the bfe. + +; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
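+; For the constant-fold cases exercised above, the checks are consistent with the
+; usual unsigned bitfield-extract arithmetic, i.e. bfe.u32(src, offset, width)
+; folding to (src >> offset) & ((1 << width) - 1). A few worked examples, assuming
+; that reading of the intrinsic:
+;   bfe.u32(160, 4, 4)        -> (0xA0 >> 4) & 0xF         = 10   (test_11)
+;   bfe.u32(131070, 16, 16)   -> (0x1FFFE >> 16) & 0xFFFF  = 1    (test_13)
+;   bfe.u32(4294967295, 1, 7) -> (0xFFFFFFFF >> 1) & 0x7F  = 0x7f (test_16)
+; The lshr_and/and_lshr/shl_lshr tests further down exercise the reverse direction:
+; shift-and-mask patterns being matched back into s_bfe_u32.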
+; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: +; SI: buffer_load_dword [[ARG:v[0-9]+]] +; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] +; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 +; SI-DAG: buffer_store_dword [[AND]] +; SI-DAG: buffer_store_dword [[BFE]] +; SI: s_endpgm +define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, + i32 addrspace(1)* %out1, + i32 addrspace(1)* %in) nounwind { + %src = load i32, i32 addrspace(1)* %in, align 4 + %and = and i32 %src, 63 + %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone + store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 + store i32 %and, i32 addrspace(1)* %out1, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lshr_and: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = lshr i32 %a, 6 + %c = and i32 %b, 7 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_lshr_and: +; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 +; SI: buffer_store_dword +define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = lshr i32 %a, %b + %d = and i32 %c, 7 + store i32 %d, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}and_lshr: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = and i32 %a, 448 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}and_lshr2: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 +; SI: buffer_store_dword +define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = and i32 %a, 511 + %c = lshr i32 %b, 6 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}shl_lshr: +; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 +; SI: buffer_store_dword +define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { + %b = shl i32 %a, 9 + %c = lshr i32 %b, 11 + store i32 %c, i32 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll new file mode 100644 index 00000000000..517a55abc09 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfi.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfi_arg_arg_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_arg_arg_imm: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_arg_imm_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { + %bfi = call 
i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfi_imm_arg_arg: +; SI: v_bfi_b32 +; EG: BFI_INT +define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { + %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone + store i32 %bfi, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll new file mode 100644 index 00000000000..50492289d74 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfm.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}bfm_arg_arg: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; EG: BFM_INT +define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_arg_imm: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b +; EG: BFM_INT +define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_imm_arg: +; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}} +; EG: BFM_INT +define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_imm_imm: +; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8 +; EG: BFM_INT +define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind { + %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone + store i32 %bfm, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}bfm_pattern: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %a = shl i32 1, %x + %b = sub i32 %a, 1 + %c = shl i32 %b, %y + store i32 %c, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}bfm_pattern_simple: +; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 +define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) { + %a = shl i32 1, %x + %b = sub i32 %a, 1 + store i32 %b, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll new file mode 100644 index 00000000000..301de4b1c82 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.brev.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_brev_i32: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: 
s_endpgm +define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_brev_i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone + store i32 %ctlz, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll new file mode 100644 index 00000000000..11ec963ab31 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float) nounwind readnone +declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone +declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone + +; FUNC-LABEL: {{^}}clamp_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm + +; EG: MOV_SAT +define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %src.fneg = fsub float -0.0, %src + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { + %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone + %src.fneg.fabs = fsub float -0.0, %src.fabs + %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} + +; 
FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} +; SI: buffer_store_dword [[RESULT]] +define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone + store float %clamp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll new file mode 100644 index 00000000000..805a88b59c7 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.class.ll @@ -0,0 +1,497 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 +declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 +declare i32 @llvm.r600.read.tidig.x() #1 +declare float @llvm.fabs.f32(float) #1 +declare double @llvm.fabs.f64(double) #1 + +; SI-LABEL: {{^}}test_class_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float @llvm.fabs.f32(float %a) #1 + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fneg = fsub float -0.0, %a + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %a.fabs = call float 
@llvm.fabs.f32(float %a) #1 + %a.fneg.fabs = fsub float -0.0, %a.fabs + %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 10 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_9bit_mask_f32: +; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}v_test_class_full_mask_f32: +; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void 
@test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; FIXME: Why isn't this using a literal constant operand? +; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: +; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 +; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fneg = fsub double -0.0, %a + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_fneg_fabs_f64: +; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], 
s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { + %a.fabs = call double @llvm.fabs.f64(double %a) #1 + %a.fneg.fabs = fsub double -0.0, %a.fabs + %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_1_f64: +; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} +; SI: s_endpgm +define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_64_f64: +; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} +; SI: s_endpgm +define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; Set all 9 bits of mask +; SI-LABEL: {{^}}test_class_full_mask_f64: +; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI-NEXT: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}v_test_class_full_mask_f64: +; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} +; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load double, double addrspace(1)* %in + + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: +; XSI: v_cmp_class_f64_e32 vcc, 1.0, +; SI: v_cmp_class_f64_e32 vcc, +; SI: s_endpgm +define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: 
{{^}}test_class_lit_constant_dynamic_mask_f64: +; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} +; SI: s_endpgm +define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %b = load i32, i32 addrspace(1)* %gep.in + + %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %gep.out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or3_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + + %sext = sext i1 %or.1 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} +; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 + %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 + %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1 + %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 + %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 + %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 + %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 + %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 + %or.0 = or i1 %class0, %class1 + %or.1 = or i1 %or.0, %class2 + %or.2 = or i1 %or.1, %class3 + %or.3 = or i1 %or.2, 
%class4 + %or.4 = or i1 %or.3, %class5 + %or.5 = or i1 %or.4, %class6 + %or.6 = or i1 %or.5, %class7 + %or.7 = or i1 %or.6, %class8 + %or.8 = or i1 %or.7, %class9 + %sext = sext i1 %or.8 to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_1: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_fold_or_class_f32_2: +; SI-NOT: v_cmp_class +; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} +; SI-NOT: v_cmp_class +; SI: s_endpgm +define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} +; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} +; SI: s_or_b64 +; SI: s_endpgm +define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.in + + %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 + %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 + %or = or i1 %class0, %class1 + + %sext = sext i1 %or to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f32: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_class_0_f64: +; SI-NOT: v_cmp_class +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { + %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll new file mode 
100644 index 00000000000..e95a51093cb --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll @@ -0,0 +1,59 @@ + +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}cube: +; CHECK: CUBE T{{[0-9]}}.X +; CHECK: CUBE T{{[0-9]}}.Y +; CHECK: CUBE T{{[0-9]}}.Z +; CHECK: CUBE * T{{[0-9]}}.W +define void @cube() #0 { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %1 = extractelement <4 x float> %0, i32 3 + %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %3 = extractelement <4 x float> %2, i32 0 + %4 = fdiv float %3, %1 + %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %6 = extractelement <4 x float> %5, i32 1 + %7 = fdiv float %6, %1 + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %9 = extractelement <4 x float> %8, i32 2 + %10 = fdiv float %9, %1 + %11 = insertelement <4 x float> undef, float %4, i32 0 + %12 = insertelement <4 x float> %11, float %7, i32 1 + %13 = insertelement <4 x float> %12, float %10, i32 2 + %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3 + %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14) + %16 = extractelement <4 x float> %15, i32 0 + %17 = extractelement <4 x float> %15, i32 1 + %18 = extractelement <4 x float> %15, i32 2 + %19 = extractelement <4 x float> %15, i32 3 + %20 = call float @fabs(float %18) + %21 = fdiv float 1.000000e+00, %20 + %22 = fmul float %16, %21 + %23 = fadd float %22, 1.500000e+00 + %24 = fmul float %17, %21 + %25 = fadd float %24, 1.500000e+00 + %26 = insertelement <4 x float> undef, float %25, i32 0 + %27 = insertelement <4 x float> %26, float %23, i32 1 + %28 = insertelement <4 x float> %27, float %19, i32 2 + %29 = insertelement <4 x float> %28, float %25, i32 3 + %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4) + call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 + +; Function Attrs: readnone +declare float @fabs(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll new file mode 100644 index 00000000000..8b32f696449 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.cvt_f32_ubyte.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone +declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone + +; SI-LABEL: {{^}}test_unpack_byte0_to_float: +; SI: v_cvt_f32_ubyte0 +define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float 
@llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte1_to_float: +; SI: v_cvt_f32_ubyte1 +define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte2_to_float: +; SI: v_cvt_f32_ubyte2 +define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_unpack_byte3_to_float: +; SI: v_cvt_f32_ubyte3 +define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone + store float %cvt, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll new file mode 100644 index 00000000000..55ca9c7536e --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fixup.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone +declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fixup_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fixup_f64: +; GCN: v_div_fixup_f64 +define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { + %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll new file mode 100644 index 00000000000..bcb7f870f1f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll @@ -0,0 +1,179 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs 
< %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +; FIXME: Enable for VI. + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate +declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone +declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone + +; GCN-LABEL: {{^}}test_div_fmas_f32: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: +; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] +; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] +; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f64: +; GCN: v_div_fmas_f64 +define void 
@test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { + %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: +; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { + %cmp = icmp eq i32 %i, 0 + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: +; SI: s_mov_b64 vcc, 0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: +; SI: s_mov_b64 vcc, -1 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} +; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] +; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] +; SI: s_endpgm +define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + %cmp1 = icmp ne i32 %d, 0 + %and = and i1 %cmp0, %cmp1 + + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: +; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] + +; SI: buffer_load_dword [[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc + + +; SI: BB9_2: +; SI: s_or_b64 exec, 
exec, [[SAVE]] +; SI: v_cmp_ne_i32_e32 vcc, 0, v0 +; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: buffer_store_dword +; SI: s_endpgm +define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 + %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 + %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 + + %a = load float, float addrspace(1)* %gep.a + %b = load float, float addrspace(1)* %gep.b + %c = load float, float addrspace(1)* %gep.c + + %cmp0 = icmp eq i32 %tid, 0 + br i1 %cmp0, label %bb, label %exit + +bb: + %val = load i32, i32 addrspace(1)* %dummy + %cmp1 = icmp ne i32 %val, 0 + br label %exit + +exit: + %cond = phi i1 [false, %entry], [%cmp1, %bb] + %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone + store float %result, float addrspace(1)* %gep.out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll new file mode 100644 index 00000000000..de830de039c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.div_scale.ll @@ -0,0 +1,364 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone +declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; SI-LABEL @test_div_scale_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float 
addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 + + %a = load double, double addrspace(1)* %gep.0, align 8 + %b = load double, double addrspace(1)* %gep.1, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_num_1: +; SI-DAG: buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_num_2: +; SI-DAG: buffer_load_dword [[B:v[0-9]+]] +; SI-DAG: s_load_dword [[A:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %b = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_scalar_den_2: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: s_load_dword [[B:s[0-9]+]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr float, float addrspace(1)* %in, i32 %tid + + %a = load float, float addrspace(1)* %gep, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_1: +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_num_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_num_2(double 
addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %b = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_1: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_scalar_den_2: +; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + + %a = load double, double addrspace(1)* %gep, align 8 + + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_1: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_all_scalar_2: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind 
readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_1: +; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] +; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f64_all_scalar_2: +; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd +; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] +; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] +; SI: buffer_store_dwordx2 [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { + %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone + %result0 = extractvalue { double, i1 } %result, 0 + store double %result0, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_inline_imm_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL 
@test_div_scale_f32_fabs_num: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL @test_div_scale_f32_fabs_den: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] +; SI: buffer_store_dword [[RESULT0]] +; SI: s_endpgm +define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone + %result0 = extractvalue { float, i1 } %result, 0 + store float %result0, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll new file mode 100644 index 00000000000..20c7af8ade5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.flbit.i32.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone + +; FUNC-LABEL: {{^}}s_flbit: +; SI: s_load_dword [[VAL:s[0-9]+]], +; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { + %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_flbit: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + %val = load i32, i32 addrspace(1)* %valptr, align 4 + %r = call i32 
@llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone + store i32 %r, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll new file mode 100644 index 00000000000..e098dd35d6d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.f64.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s + +declare double @llvm.fabs.f64(double %Val) +declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}fract_f64: +; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f64_neg: +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %neg = fsub double 0.0, %val + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f64_neg_abs: +; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| +; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] +; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] +; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] +; CI: buffer_store_dwordx2 [[FRC]] +define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) 
nounwind { + %val = load double, double addrspace(1)* %src, align 4 + %abs = call double @llvm.fabs.f64(double %val) + %neg = fsub double 0.0, %abs + %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone + store double %fract, double addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll new file mode 100644 index 00000000000..7501b4b7546 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.fract.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.fabs.f32(float %Val) +declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone + +; Legacy name +declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}fract_f32: +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_legacy_amdil: +; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] +; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_neg: +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]] +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %neg = fsub float 0.0, %val + %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fract_f32_neg_abs: +; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| +; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]| +; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]] +; GCN: buffer_store_dword [[RESULT]] +; EG: FRACT +define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { + %val = load float, float addrspace(1)* %src, align 4 + %abs = call float @llvm.fabs.f32(float %val) + %neg = fsub float 0.0, %abs 
+ %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone + store float %fract, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll new file mode 100644 index 00000000000..42102e30f07 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imad24.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FIXME: Store of i32 seems to be broken pre-EG somehow? + +declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_imad24: +; SI: v_mad_i32_i24 +; CM: MULADD_INT24 +; R600: MULLO_INT +; R600: ADD_INT +define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll new file mode 100644 index 00000000000..46662f96c29 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imax.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_imax: +; SI: v_max_i32_e32 +define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_imax: +; SI: s_max_i32 +define void @scalar_imax(i32 %p0, i32 %p1) #0 { +entry: + %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.imax(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll new file mode 100644 index 00000000000..34b454e2375 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imin.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_imin: +; SI: v_min_i32_e32 +define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, 
align 4 + %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_imin: +; SI: s_min_i32 +define void @scalar_imin(i32 %p0, i32 %p1) #0 { +entry: + %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.imin(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll new file mode 100644 index 00000000000..fdc1172260b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.imul24.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_imul24: +; SI: v_mul_i32_i24 +; CM: MUL_INT24 +; R600: MULLO_INT +define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll new file mode 100644 index 00000000000..057708e7b5c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}kill_gs_const: +; SI-NOT: v_cmpx_le_f32 +; SI: s_mov_b64 exec, 0 + +define void @kill_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kill(float %3) + ret void +} + +; SI-LABEL: {{^}}kill_vcc_implicit_def: +; SI-NOT: v_cmp_gt_f32_e32 vcc, +; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} +; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { +entry: + %tmp0 = fcmp olt float %13, 0.0 + call void @llvm.AMDGPU.kill(float %14) + %tmp1 = select i1 %tmp0, float 1.0, float 0.0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) + ret void +} + 
+declare void @llvm.AMDGPU.kill(float) +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="2" } +attributes #1 = { "ShaderType"="0" } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll new file mode 100644 index 00000000000..a59c0ce6d67 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.ldexp.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone +declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_ldexp_f32: +; SI: v_ldexp_f32 +; SI: s_endpgm +define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { + %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_ldexp_f64: +; SI: v_ldexp_f64 +; SI: s_endpgm +define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { + %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll new file mode 100644 index 00000000000..4cafd563685 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.legacy.rsq.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_legacy_f32: +; SI: v_rsq_legacy_f32_e32 +; EG: RECIPSQRT_IEEE +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll new file mode 100644 index 00000000000..83b56a5029d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.mul.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.mul(float ,float ) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll new file mode 100644 index 00000000000..d2a655bf909 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.f64.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rcp_f64: +; SI: v_rcp_f64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rcp_pat_f64: +; SI: v_rcp_f64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE-NOT: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64 +; SI-SAFE: v_rcp_f64 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll new file mode 100644 index 00000000000..edd6e9a72f1 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rcp.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s + +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + +declare float @llvm.sqrt.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rcp_f32: +; SI: v_rcp_f32_e32 +; EG: RECIP_IEEE +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Evergreen only ever does unsafe fp math. 
+; FUNC-LABEL: {{^}}rcp_pat_f32: + +; SI-SAFE: v_rcp_f32_e32 +; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 + +; EG: RECIP_IEEE + +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: +; SI-UNSAFE: v_rsq_f32_e32 +; SI-SAFE: v_sqrt_f32_e32 +; SI-SAFE: v_rcp_f32_e32 + +; EG: RECIPSQRT_IEEE +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone + %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll new file mode 100644 index 00000000000..67f1d22c717 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s + +declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f64: +; SI: v_rsq_clamp_f64_e32 + +; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] +; TODO: this constant should be folded: +; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 +; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff +; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] +; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] +; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff +; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] +; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] + +define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { + %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone + store double %rsq_clamped, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll new file mode 100644 index 00000000000..eeff2536b23 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_clamped_f32: +; SI: v_rsq_clamp_f32_e32 + +; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} +; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] +; TODO: this constant should be folded: +; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff +; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] + +; EG: RECIPSQRT_CLAMPED + +define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone + store float %rsq_clamped, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll new file mode 100644 index 00000000000..36b72f14db1 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn 
-mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone + +; FUNC-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; EG: RECIPSQRT_IEEE +define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 +; EG: RECIPSQRT_IEEE +define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 +; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 +; EG: RECIPSQRT_IEEE +define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { + %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll new file mode 100644 index 00000000000..10206609bb5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.tex.ll @@ -0,0 +1,42 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %addr = load <4 x float>, <4 x float> addrspace(1)* %in + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 
0, i32 0, i32 7) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) + store <4 x float> %res16, <4 x float> addrspace(1)* %out + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll new file mode 100644 index 00000000000..6b546a7e17c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trig_preop.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone + +; SI-LABEL: {{^}}test_trig_preop_f64: +; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] +; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: +; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], +; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 +; SI: buffer_store_dwordx2 [[RESULT]], +; SI: s_endpgm +define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { + %a = load double, double addrspace(1)* %aptr, align 8 + %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone + store double %result, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll new file mode 100644 index 00000000000..74792e50017 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.trunc.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; R600: {{^}}amdgpu_trunc: +; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: {{^}}amdgpu_trunc: +; SI: v_trunc_f32 + +define void @amdgpu_trunc(float addrspace(1)* %out, float %x) { +entry: + %0 = call float @llvm.AMDGPU.trunc(float %x) + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDGPU.trunc(float ) readnone diff --git 
a/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll new file mode 100644 index 00000000000..77a073b0cb0 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umad24.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}test_umad24: +; SI: v_mad_u32_u24 +; EG: MULADD_UINT24 +; R600: MULLO_UINT +; R600: ADD_INT +define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}commute_umad24: +; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]] +; SI: buffer_store_dword [[RESULT]] +define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1 + + %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4 + %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4 + %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone + store i32 %mad, i32 addrspace(1)* %out.gep, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll new file mode 100644 index 00000000000..a97d103016d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umax.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_umax: +; SI: v_max_u32_e32 +define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_umax: +; SI: s_max_u32 +define void @scalar_umax(i32 %p0, i32 %p1) #0 { +entry: + %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1) + %bc = bitcast i32 %max to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}trunc_zext_umax: +; SI: buffer_load_ubyte [[VREG:v[0-9]+]], +; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] +; SI-NOT: and +; SI: 
buffer_store_short [[RESULT]], +define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = zext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = zext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.umax(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll new file mode 100644 index 00000000000..2acd10e0c63 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umin.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}vector_umin: +; SI: v_min_u32_e32 +define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { +main_body: + %load = load i32, i32 addrspace(1)* %in, align 4 + %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}scalar_umin: +; SI: s_min_u32 +define void @scalar_umin(i32 %p0, i32 %p1) #0 { +entry: + %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1) + %bc = bitcast i32 %min to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) + ret void +} + +; SI-LABEL: {{^}}trunc_zext_umin: +; SI: buffer_load_ubyte [[VREG:v[0-9]+]], +; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] +; SI-NOT: and +; SI: buffer_store_short [[RESULT]], +define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { + %tmp5 = load i8, i8 addrspace(1)* %src, align 1 + %tmp2 = zext i8 %tmp5 to i32 + %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone + %tmp4 = trunc i32 %tmp3 to i8 + %tmp6 = zext i8 %tmp4 to i16 + store i16 %tmp6, i16 addrspace(1)* %out, align 2 + ret void +} + +; Function Attrs: readnone +declare i32 @llvm.AMDGPU.umin(i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll new file mode 100644 index 00000000000..76624a078b3 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.umul24.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + 
+declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}test_umul24: +; SI: v_mul_u32_u24 +; R600: MUL_UINT24 +; R600: MULLO_UINT +define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { + %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll new file mode 100644 index 00000000000..3d05da616e4 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -0,0 +1,59 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s + +;GCN-LABEL: {{^}}main: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 +;GCN-NEXT: v_interp_mov_f32 +;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +main_body: + %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) + %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7) + ret void +} + +; Test that v_interp_p1 uses different source and destination registers +; on 16 bank LDS chips. + +; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: +; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] + +define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +main_body: + %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) + %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) + %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) + %25 = call float @fabs(float %22) + %26 = call float @fabs(float %23) + %27 = call float @fabs(float %24) + %28 = call i32 @llvm.SI.packf16(float %25, float %26) + %29 = bitcast i32 %28 to float + %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) + %31 = bitcast i32 %30 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) + ret void +} + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.constant(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll new file mode 100644 index 00000000000..275cb580bc9 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll @@ -0,0 +1,509 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde 
-verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}gather4_v2: +;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4: +;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl: +;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l: +;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b: +;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl: +;CHECK: 
image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl_v8: +;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz_v2: +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz: +;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_o: +;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl_o: +;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_cl_o_v8: +;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_cl_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l_o: +;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_l_o_v8: +;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_l_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_o: +;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_o_v8: +;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_o_v8() #0 { +main_body: + 
%r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_b_cl_o: +;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_lz_o: +;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_c: +;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl: +;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl_v8: +;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> 
undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l: +;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l_v8: +;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b: +;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_v8: +;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_cl: +;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x 
float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz: +;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +;CHECK-LABEL: {{^}}gather4_c_o: +;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_o_v8: +;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_cl_o: +;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_l_o: +;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_l_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement 
<4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_o: +;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_b_cl_o: +;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_b_cl_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz_o: +;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: +;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @gather4_c_lz_o_v8() #0 { +main_body: + %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + + +declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, 
<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 
+declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll new file mode 100644 index 00000000000..06ee98e91b3 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll @@ -0,0 +1,45 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}getlod: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: {{^}}getlod_v2: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v2() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + +;CHECK-LABEL: {{^}}getlod_v4: +;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getlod_v4() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) + ret void +} + + +declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll new file mode 100644 index 00000000000..0fac8d79956 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.image.ll @@ -0,0 +1,50 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}image_load: +;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} +define void @image_load() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}image_load_mip: +;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @image_load_mip() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}getresinfo: +;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} +define void @getresinfo() #0 { +main_body: + %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll new file mode 100644 index 00000000000..4bc638a2806 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll @@ -0,0 +1,310 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}sample: +;CHECK: s_wqm +;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cl: +;CHECK: s_wqm +;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b: +;CHECK: s_wqm +;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void 
@sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c: +;CHECK: s_wqm +;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { 
+main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b: +;CHECK: s_wqm +;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() 
#0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x 
float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll new file mode 100644 index 00000000000..9d8935414ed --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll @@ -0,0 +1,310 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}sample: +;CHECK: s_wqm +;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cl: +;CHECK: s_wqm +;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, 
i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b: +;CHECK: s_wqm +;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, 
i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c: +;CHECK: s_wqm +;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, 
i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_d_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_d_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_l: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_l() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b: +;CHECK: s_wqm +;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_b_cl: +;CHECK: s_wqm +;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_b_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_lz: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_lz() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + +;CHECK-LABEL: {{^}}sample_c_cd_cl: +;CHECK-NOT: s_wqm +;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +define void @sample_c_cd_cl() #0 { +main_body: + %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + +declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #1 + +declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.SI.imageload.ll b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll new file mode 100644 index 00000000000..b67716c3b66 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.imageload.ll @@ -0,0 +1,132 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 +;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 +;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, + <32 x i8> undef, i32 1) + %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, + <32 x i8> undef, i32 2) + %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, + <32 x i8> undef, i32 3) + %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, + <32 x i8> undef, i32 4) + %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, + <32 x i8> undef, i32 5) + %res6 = call <4 x i32> @llvm.SI.imageload.(<4 
x i32> %v6, + <32 x i8> undef, i32 6) + %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, + <32 x i8> undef, i32 10) + %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, + <32 x i8> undef, i32 11) + %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, + <32 x i8> undef, i32 15) + %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, + <32 x i8> undef, i32 16) + %e1 = extractelement <4 x i32> %res1, i32 0 + %e2 = extractelement <4 x i32> %res2, i32 1 + %e3 = extractelement <4 x i32> %res3, i32 2 + %e4 = extractelement <4 x i32> %res4, i32 3 + %t0 = extractelement <4 x i32> %res5, i32 0 + %t1 = extractelement <4 x i32> %res5, i32 1 + %e5 = add i32 %t0, %t1 + %t2 = extractelement <4 x i32> %res6, i32 0 + %t3 = extractelement <4 x i32> %res6, i32 2 + %e6 = add i32 %t2, %t3 + %t10 = extractelement <4 x i32> %res10, i32 2 + %t11 = extractelement <4 x i32> %res10, i32 3 + %e10 = add i32 %t10, %t11 + %t12 = extractelement <4 x i32> %res11, i32 0 + %t13 = extractelement <4 x i32> %res11, i32 1 + %t14 = extractelement <4 x i32> %res11, i32 2 + %t15 = add i32 %t12, %t13 + %e11 = add i32 %t14, %t15 + %t28 = extractelement <4 x i32> %res15, i32 0 + %t29 = extractelement <4 x i32> %res15, i32 1 + %t30 = extractelement <4 x i32> %res15, i32 2 + %t31 = extractelement <4 x i32> %res15, i32 3 + %t32 = add i32 %t28, %t29 + %t33 = add i32 %t30, %t31 + %e15 = add i32 %t32, %t33 + %e16 = extractelement <4 x i32> %res16, i32 3 + %s1 = add i32 %e1, %e2 + %s2 = add i32 %s1, %e3 + %s3 = add i32 %s2, %e4 + %s4 = add i32 %s3, %e5 + %s5 = add i32 %s4, %e6 + %s9 = add i32 %s5, %e10 + %s10 = add i32 %s9, %e11 + %s14 = add i32 %s10, %e15 + %s15 = add i32 %s14, %e16 + %s16 = bitcast i32 %s15 to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) + ret void +} + +; Test that coordinates are stored in vgprs and not sgprs +; CHECK: vgpr_coords +; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} +define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 + %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 + %22 = getelementptr float, float addrspace(2)* %21, i32 0 + %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 + %24 = getelementptr float, float addrspace(2)* %21, i32 1 + %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 + %26 = getelementptr float, float addrspace(2)* %21, i32 4 + %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 + %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 + %30 = bitcast float %27 to i32 + %31 = bitcast float %23 to i32 + %32 = bitcast float %25 to i32 + %33 = insertelement <4 x i32> undef, i32 %31, i32 0 + %34 = insertelement <4 x i32> %33, i32 %32, i32 1 + %35 = insertelement <4 x i32> %34, i32 %30, i32 2 + %36 = insertelement <4 x i32> %35, i32 undef, i32 3 + %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) + %38 = extractelement <4 x i32> %37, i32 0 + %39 = extractelement <4 x i32> %37, i32 1 + %40 = extractelement <4 x i32> %37,
i32 2 + %41 = extractelement <4 x i32> %37, i32 3 + %42 = bitcast i32 %38 to float + %43 = bitcast i32 %39 to float + %44 = bitcast i32 %40 to float + %45 = bitcast i32 %41 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) + ret void +} + +declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone +; Function Attrs: nounwind readnone +declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null} +!1 = !{} +!2 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll new file mode 100644 index 00000000000..f6c258539d5 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s + +; Example of a simple geometry shader loading vertex attributes from the +; ESGS ring buffer + +; FIXME: Out of bounds immediate offset crashes + +; CHECK-LABEL: {{^}}main: +; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc +; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc + +define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { +main_body: + %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = shl i32 %arg6, 2 + %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) + %tmp13 = bitcast i32 %tmp12 to float + %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp15 = bitcast i32 %tmp14 to float + %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) + %tmp17 = bitcast i32 %tmp16 to float + %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp19 = bitcast i32 %tmp18 to float + + %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp21 = bitcast i32 %tmp20 to float + + %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp23 = bitcast i32 %tmp22 to float + + call void @llvm.SI.export(i32 15, i32 0, i32 
1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) + ret void +} + +; Function Attrs: nounwind readonly +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readonly +declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { nounwind readonly } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll new file mode 100644 index 00000000000..ac95fd0b83a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.resinfo.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 +; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, + i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { + %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) + %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) + %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) + %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) + %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) + %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) + %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) + %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) + %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) + %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) + %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) + %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) + %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) + %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) + %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) + %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) + %e1 = extractelement <4 x i32> %res1, i32 0 + 
%e2 = extractelement <4 x i32> %res2, i32 1 + %e3 = extractelement <4 x i32> %res3, i32 2 + %e4 = extractelement <4 x i32> %res4, i32 3 + %t0 = extractelement <4 x i32> %res5, i32 0 + %t1 = extractelement <4 x i32> %res5, i32 1 + %e5 = add i32 %t0, %t1 + %t2 = extractelement <4 x i32> %res6, i32 0 + %t3 = extractelement <4 x i32> %res6, i32 2 + %e6 = add i32 %t2, %t3 + %t4 = extractelement <4 x i32> %res7, i32 0 + %t5 = extractelement <4 x i32> %res7, i32 3 + %e7 = add i32 %t4, %t5 + %t6 = extractelement <4 x i32> %res8, i32 1 + %t7 = extractelement <4 x i32> %res8, i32 2 + %e8 = add i32 %t6, %t7 + %t8 = extractelement <4 x i32> %res9, i32 1 + %t9 = extractelement <4 x i32> %res9, i32 3 + %e9 = add i32 %t8, %t9 + %t10 = extractelement <4 x i32> %res10, i32 2 + %t11 = extractelement <4 x i32> %res10, i32 3 + %e10 = add i32 %t10, %t11 + %t12 = extractelement <4 x i32> %res11, i32 0 + %t13 = extractelement <4 x i32> %res11, i32 1 + %t14 = extractelement <4 x i32> %res11, i32 2 + %t15 = add i32 %t12, %t13 + %e11 = add i32 %t14, %t15 + %t16 = extractelement <4 x i32> %res12, i32 0 + %t17 = extractelement <4 x i32> %res12, i32 1 + %t18 = extractelement <4 x i32> %res12, i32 3 + %t19 = add i32 %t16, %t17 + %e12 = add i32 %t18, %t19 + %t20 = extractelement <4 x i32> %res13, i32 0 + %t21 = extractelement <4 x i32> %res13, i32 2 + %t22 = extractelement <4 x i32> %res13, i32 3 + %t23 = add i32 %t20, %t21 + %e13 = add i32 %t22, %t23 + %t24 = extractelement <4 x i32> %res14, i32 1 + %t25 = extractelement <4 x i32> %res14, i32 2 + %t26 = extractelement <4 x i32> %res14, i32 3 + %t27 = add i32 %t24, %t25 + %e14 = add i32 %t26, %t27 + %t28 = extractelement <4 x i32> %res15, i32 0 + %t29 = extractelement <4 x i32> %res15, i32 1 + %t30 = extractelement <4 x i32> %res15, i32 2 + %t31 = extractelement <4 x i32> %res15, i32 3 + %t32 = add i32 %t28, %t29 + %t33 = add i32 %t30, %t31 + %e15 = add i32 %t32, %t33 + %e16 = extractelement <4 x i32> %res16, i32 3 + %s1 = add i32 %e1, %e2 + %s2 = add i32 %s1, %e3 + %s3 = add i32 %s2, %e4 + %s4 = add i32 %s3, %e5 + %s5 = add i32 %s4, %e6 + %s6 = add i32 %s5, %e7 + %s7 = add i32 %s6, %e8 + %s8 = add i32 %s7, %e9 + %s9 = add i32 %s8, %e10 + %s10 = add i32 %s9, %e11 + %s11 = add i32 %s10, %e12 + %s12 = add i32 %s11, %e13 + %s13 = add i32 %s12, %e14 + %s14 = add i32 %s13, %e15 + %s15 = add i32 %s14, %e16 + %s16 = bitcast i32 %s15 to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) + ret void +} + +declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll new file mode 100644 index 00000000000..ce9558cbf81 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sample-masked.ll @@ -0,0 +1,96 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s + +; CHECK-LABEL: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + 
+; CHECK-LABEL: {{^}}v2: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 +define void @v2(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v3: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +define void @v3(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + %4 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v4: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 +define void @v4(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) + ret void +} + +; CHECK-LABEL: {{^}}v5: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +define void @v5(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v6: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 +define void @v6(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 1 + %3 = extractelement <4 x float> %1, i32 2 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +; CHECK-LABEL: {{^}}v7: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 +define void @v7(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) + ret void +} + +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.sample.ll new file mode 100644 index 00000000000..509c45f588b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sample.ll @@ -0,0 +1,160 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga 
-verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 +;CHECK-DAG: image_sample {{v[0-9]+}}, 2 +;CHECK-DAG: image_sample {{v[0-9]+}}, 1 +;CHECK-DAG: image_sample {{v[0-9]+}}, 4 +;CHECK-DAG: image_sample {{v[0-9]+}}, 8 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 +;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 +;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 +;CHECK-DAG: image_sample {{v[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 + %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 + %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 + %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 + %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, + <32 x i8> undef, <16 x i8> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, + <32 x i8> undef, <16 x i8> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, + <32 x i8> undef, <16 x i8> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, + <32 x i8> undef, <16 x i8> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, + <32 x i8> undef, <16 x i8> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, + <32 x i8> undef, <16 x i8> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, + <32 x i8> undef, <16 x i8> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, + <32 x i8> undef, <16 x i8> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, + <32 x i8> undef, <16 x i8> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, + <32 x i8> undef, <16 x i8> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, + <32 x i8> undef, <16 x i8> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, + <32 x i8> undef, <16 x i8> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, + <32 x i8> undef, <16 x i8> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, + <32 x i8> undef, <16 x i8> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, + <32 x i8> undef, <16 x i8> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, + <32 x i8> undef, <16 x i8> undef, i32 16) + %e1 = extractelement <4 x float> %res1, i32 0 + %e2 = extractelement <4 x float> %res2, i32 1 + %e3 = extractelement <4 x float> 
%res3, i32 2 + %e4 = extractelement <4 x float> %res4, i32 3 + %t0 = extractelement <4 x float> %res5, i32 0 + %t1 = extractelement <4 x float> %res5, i32 1 + %e5 = fadd float %t0, %t1 + %t2 = extractelement <4 x float> %res6, i32 0 + %t3 = extractelement <4 x float> %res6, i32 2 + %e6 = fadd float %t2, %t3 + %t4 = extractelement <4 x float> %res7, i32 0 + %t5 = extractelement <4 x float> %res7, i32 3 + %e7 = fadd float %t4, %t5 + %t6 = extractelement <4 x float> %res8, i32 1 + %t7 = extractelement <4 x float> %res8, i32 2 + %e8 = fadd float %t6, %t7 + %t8 = extractelement <4 x float> %res9, i32 1 + %t9 = extractelement <4 x float> %res9, i32 3 + %e9 = fadd float %t8, %t9 + %t10 = extractelement <4 x float> %res10, i32 2 + %t11 = extractelement <4 x float> %res10, i32 3 + %e10 = fadd float %t10, %t11 + %t12 = extractelement <4 x float> %res11, i32 0 + %t13 = extractelement <4 x float> %res11, i32 1 + %t14 = extractelement <4 x float> %res11, i32 2 + %t15 = fadd float %t12, %t13 + %e11 = fadd float %t14, %t15 + %t16 = extractelement <4 x float> %res12, i32 0 + %t17 = extractelement <4 x float> %res12, i32 1 + %t18 = extractelement <4 x float> %res12, i32 3 + %t19 = fadd float %t16, %t17 + %e12 = fadd float %t18, %t19 + %t20 = extractelement <4 x float> %res13, i32 0 + %t21 = extractelement <4 x float> %res13, i32 2 + %t22 = extractelement <4 x float> %res13, i32 3 + %t23 = fadd float %t20, %t21 + %e13 = fadd float %t22, %t23 + %t24 = extractelement <4 x float> %res14, i32 1 + %t25 = extractelement <4 x float> %res14, i32 2 + %t26 = extractelement <4 x float> %res14, i32 3 + %t27 = fadd float %t24, %t25 + %e14 = fadd float %t26, %t27 + %t28 = extractelement <4 x float> %res15, i32 0 + %t29 = extractelement <4 x float> %res15, i32 1 + %t30 = extractelement <4 x float> %res15, i32 2 + %t31 = extractelement <4 x float> %res15, i32 3 + %t32 = fadd float %t28, %t29 + %t33 = fadd float %t30, %t31 + %e15 = fadd float %t32, %t33 + %e16 = extractelement <4 x float> %res16, i32 3 + %s1 = fadd float %e1, %e2 + %s2 = fadd float %s1, %e3 + %s3 = fadd float %s2, %e4 + %s4 = fadd float %s3, %e5 + %s5 = fadd float %s4, %e6 + %s6 = fadd float %s5, %e7 + %s7 = fadd float %s6, %e8 + %s8 = fadd float %s7, %e9 + %s9 = fadd float %s8, %e10 + %s10 = fadd float %s9, %e11 + %s11 = fadd float %s10, %e12 + %s12 = fadd float %s11, %e13 + %s13 = fadd float %s12, %e14 + %s14 = fadd float %s13, %e15 + %s15 = fadd float %s14, %e16 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) + ret void +} + +; CHECK: {{^}}v1: +; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 +define void @v1(i32 %a1) #0 { +entry: + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) + %2 = extractelement <4 x float> %1, i32 0 + %3 = extractelement <4 x float> %1, i32 1 + %4 = extractelement <4 x float> %1, i32 2 + %5 = extractelement <4 x float> %1, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) + ret void +} + + +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sampled.ll b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll new file mode 100644 index 
00000000000..f2badff2a99 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sampled.ll @@ -0,0 +1,143 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 +;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 +;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 +;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { + %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 + %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 + %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 + %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 + %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 + %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 + %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 + %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 + %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 + %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 + %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 + %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 + %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 + %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 + %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 + %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, + <32 x i8> undef, <16 x i8> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, + <32 x i8> undef, <16 x i8> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, + <32 x i8> undef, <16 x i8> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, + <32 x i8> undef, <16 x i8> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, + <32 x i8> undef, <16 x i8> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, + <32 x i8> undef, <16 x i8> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, + <32 x i8> undef, <16 x i8> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, + <32 x i8> undef, <16 x i8> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, + <32 x i8> undef, <16 x i8> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, + <32 x i8> undef, <16 x i8> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, + <32 x i8> undef, <16 x i8> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, + <32 x i8> undef, <16 x i8> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, + <32 x i8> undef, <16 x i8> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, + <32 x i8> undef, <16 x i8> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, + <32 x i8> 
undef, <16 x i8> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, + <32 x i8> undef, <16 x i8> undef, i32 16) + %e1 = extractelement <4 x float> %res1, i32 0 + %e2 = extractelement <4 x float> %res2, i32 1 + %e3 = extractelement <4 x float> %res3, i32 2 + %e4 = extractelement <4 x float> %res4, i32 3 + %t0 = extractelement <4 x float> %res5, i32 0 + %t1 = extractelement <4 x float> %res5, i32 1 + %e5 = fadd float %t0, %t1 + %t2 = extractelement <4 x float> %res6, i32 0 + %t3 = extractelement <4 x float> %res6, i32 2 + %e6 = fadd float %t2, %t3 + %t4 = extractelement <4 x float> %res7, i32 0 + %t5 = extractelement <4 x float> %res7, i32 3 + %e7 = fadd float %t4, %t5 + %t6 = extractelement <4 x float> %res8, i32 1 + %t7 = extractelement <4 x float> %res8, i32 2 + %e8 = fadd float %t6, %t7 + %t8 = extractelement <4 x float> %res9, i32 1 + %t9 = extractelement <4 x float> %res9, i32 3 + %e9 = fadd float %t8, %t9 + %t10 = extractelement <4 x float> %res10, i32 2 + %t11 = extractelement <4 x float> %res10, i32 3 + %e10 = fadd float %t10, %t11 + %t12 = extractelement <4 x float> %res11, i32 0 + %t13 = extractelement <4 x float> %res11, i32 1 + %t14 = extractelement <4 x float> %res11, i32 2 + %t15 = fadd float %t12, %t13 + %e11 = fadd float %t14, %t15 + %t16 = extractelement <4 x float> %res12, i32 0 + %t17 = extractelement <4 x float> %res12, i32 1 + %t18 = extractelement <4 x float> %res12, i32 3 + %t19 = fadd float %t16, %t17 + %e12 = fadd float %t18, %t19 + %t20 = extractelement <4 x float> %res13, i32 0 + %t21 = extractelement <4 x float> %res13, i32 2 + %t22 = extractelement <4 x float> %res13, i32 3 + %t23 = fadd float %t20, %t21 + %e13 = fadd float %t22, %t23 + %t24 = extractelement <4 x float> %res14, i32 1 + %t25 = extractelement <4 x float> %res14, i32 2 + %t26 = extractelement <4 x float> %res14, i32 3 + %t27 = fadd float %t24, %t25 + %e14 = fadd float %t26, %t27 + %t28 = extractelement <4 x float> %res15, i32 0 + %t29 = extractelement <4 x float> %res15, i32 1 + %t30 = extractelement <4 x float> %res15, i32 2 + %t31 = extractelement <4 x float> %res15, i32 3 + %t32 = fadd float %t28, %t29 + %t33 = fadd float %t30, %t31 + %e15 = fadd float %t32, %t33 + %e16 = extractelement <4 x float> %res16, i32 3 + %s1 = fadd float %e1, %e2 + %s2 = fadd float %s1, %e3 + %s3 = fadd float %s2, %e4 + %s4 = fadd float %s3, %e5 + %s5 = fadd float %s4, %e6 + %s6 = fadd float %s5, %e7 + %s7 = fadd float %s6, %e8 + %s8 = fadd float %s7, %e9 + %s9 = fadd float %s8, %e10 + %s10 = fadd float %s9, %e11 + %s11 = fadd float %s10, %e12 + %s12 = fadd float %s11, %e13 + %s13 = fadd float %s12, %e14 + %s14 = fadd float %s13, %e15 + %s15 = fadd float %s14, %e16 + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) + ret void +} + +declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll new file mode 100644 index 00000000000..2198590f2df --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll @@ -0,0 +1,20 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}main: +; BOTH: 
s_mov_b32 m0, s0 +; VI-NEXT: s_nop 0 +; BOTH-NEXT: s_sendmsg Gs_done(nop) +; BOTH-NEXT: s_endpgm + +define void @main(i32 inreg %a) #0 { +main_body: + call void @llvm.SI.sendmsg(i32 3, i32 %a) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.SI.sendmsg(i32, i32) #1 + +attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll new file mode 100644 index 00000000000..09675d50335 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll @@ -0,0 +1,24 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: s_mov_b32 m0, 0 +; CHECK-NOT: s_mov_b32 m0 +; CHECK: s_sendmsg Gs(emit stream 0) +; CHECK: s_sendmsg Gs(cut stream 1) +; CHECK: s_sendmsg Gs(emit-cut stream 2) +; CHECK: s_sendmsg Gs_done(nop) + +define void @main() { +main_body: + call void @llvm.SI.sendmsg(i32 34, i32 0); + call void @llvm.SI.sendmsg(i32 274, i32 0); + call void @llvm.SI.sendmsg(i32 562, i32 0); + call void @llvm.SI.sendmsg(i32 3, i32 0); + ret void +} + +; Function Attrs: nounwind +declare void @llvm.SI.sendmsg(i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll new file mode 100644 index 00000000000..71f51548a5f --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -0,0 +1,47 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}test1: +;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test1(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test2: +;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test2(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test3: +;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test3(i32 %a1, i32 %vaddr) #0 { + %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 + call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, + i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}test4: +;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 +define void @test4(i32 %vdata, i32 %vaddr) #0 { + call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, + i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, + i32 1, i32 0) + ret void +} + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void 
@llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/llvm.SI.tid.ll b/test/CodeGen/AMDGPU/llvm.SI.tid.ll new file mode 100644 index 00000000000..f6e6d7050ba --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.SI.tid.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +;GCN: v_mbcnt_lo_u32_b32_e64 +;SI: v_mbcnt_hi_u32_b32_e32 +;VI: v_mbcnt_hi_u32_b32_e64 + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %4 = call i32 @llvm.SI.tid() + %5 = bitcast i32 %4 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) + ret void +} + +declare i32 @llvm.SI.tid() readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll new file mode 100644 index 00000000000..036cd2ca82a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.dp4.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone + +define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { + %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 + %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 + %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone + store float %dp4, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll new file mode 100644 index 00000000000..42df6db1ccf --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}kilp_gs_const: +; SI: s_mov_b64 exec, 0 +define void @kilp_gs_const() #0 { +main_body: + %0 = icmp ule i32 0, 3 + %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %1) + %2 = icmp ule i32 3, 0 + %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 + call void @llvm.AMDGPU.kilp(float %3) + ret void +} + +declare void @llvm.AMDGPU.kilp(float) + +attributes #0 = { "ShaderType"="2" } + +!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll new file mode 100644 index 00000000000..4e4c2ec7791 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone + +; FUNC-LABEL: {{^}}test_lrp: +; SI: v_sub_f32 +; SI: v_mad_f32 +define void 
@test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { + %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone + store float %mad, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll new file mode 100644 index 00000000000..c65df8b3e8d --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.cos.ll @@ -0,0 +1,41 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC + +;FUNC-LABEL: test +;EG: MULADD_IEEE * +;EG: FRACT * +;EG: ADD * +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG-NOT: COS +;SI: v_cos_f32 +;SI-NOT: v_cos_f32 + +define void @test(float addrspace(1)* %out, float %x) #1 { + %cos = call float @llvm.cos.f32(float %x) + store float %cos, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: testv +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +;EG-NOT: COS +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI: v_cos_f32 +;SI-NOT: v_cos_f32 + +define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { + %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) + store <4 x float> %cos, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.cos.f32(float) readnone +declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.exp2.ll b/test/CodeGen/AMDGPU/llvm.exp2.ll new file mode 100644 index 00000000000..42698925aae --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +;FUNC-LABEL: {{^}}test: +;EG: EXP_IEEE +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.exp2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv2: +;EG: EXP_IEEE +;EG: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 +;SI: v_exp_f32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv4: +;EG: EXP_IEEE +;EG: EXP_IEEE +;EG: EXP_IEEE +;EG: EXP_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_exp_f32 +;SI: v_exp_f32 +;SI: v_exp_f32 +;SI: v_exp_f32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.exp2.f32(float) readnone +declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/llvm.log2.ll b/test/CodeGen/AMDGPU/llvm.log2.ll new file mode 100644 index 00000000000..c75e7850b35 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.log2.ll @@ -0,0 +1,80 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC +;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +;FUNC-LABEL: {{^}}test: +;EG: LOG_IEEE +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 + +define void @test(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.log2.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv2: +;EG: LOG_IEEE +;EG: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. 
+;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 +;SI: v_log_f32 + +define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}testv4: +;EG: LOG_IEEE +;EG: LOG_IEEE +;EG: LOG_IEEE +;EG: LOG_IEEE +; FIXME: We should be able to merge these packets together on Cayman so we +; have a maximum of 4 instructions. +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} +;SI: v_log_f32 +;SI: v_log_f32 +;SI: v_log_f32 +;SI: v_log_f32 +define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.log2.f32(float) readnone +declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone +declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll new file mode 100644 index 00000000000..e491732cf9c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -0,0 +1,365 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind + + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: 
ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2: +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 + +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 + +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 + +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 +; SI: ds_write_b16 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind + ret void +} + +; FIXME: Use 64-bit ops +; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: ds_read_b32 +; SI-DAG: ds_write_b32 + +; SI-DAG: s_endpgm +define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* + %bcout = bitcast i64 
addrspace(3)* %out to i8 addrspace(3)* + call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1: +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte +; SI-DAG: buffer_load_ubyte +; SI-DAG: buffer_store_byte + +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2: +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort +; SI-DAG: buffer_load_ushort + +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short +; SI-DAG: buffer_store_short + +; 
SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind + ret void +} + +; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16: +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: s_endpgm +define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { + %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* + %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.pow.ll b/test/CodeGen/AMDGPU/llvm.pow.ll new file mode 100644 index 00000000000..c4ae652619c --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.pow.ll @@ -0,0 +1,40 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-LABEL: test1: +;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, + +define void @test1(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = call float @llvm.pow.f32( float %r0, float %r1) + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: test2: +;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: LOG_IEEE * 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, +;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, +define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { + %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare float @llvm.pow.f32(float ,float ) readonly +declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/test/CodeGen/AMDGPU/llvm.rint.f64.ll new file mode 100644 index 00000000000..c63fb172794 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rint_f64: +; CI: v_rndne_f64_e32 + +; SI-DAG: v_add_f64 +; SI-DAG: v_add_f64 +; SI-DAG v_cmp_gt_f64_e64 +; SI: v_cndmask_b32 +; SI: v_cndmask_b32 +; SI: s_endpgm +define void @rint_f64(double addrspace(1)* %out, double %in) { +entry: + %0 = call double @llvm.rint.f64(double %in) + store double %0, double addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v2f64: +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +entry: + %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) + store <2 x double> %0, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v4f64: +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +; CI: v_rndne_f64_e32 +define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +entry: + %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) + store <4 x double> %0, <4 x double> addrspace(1)* %out + ret void +} + + +declare double @llvm.rint.f64(double) #0 +declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 +declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0 diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll new file mode 100644 index 00000000000..661db51ad03 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.rint.ll @@ -0,0 +1,62 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rint_f32: +; R600: RNDNE + +; SI: v_rndne_f32_e32 +define void @rint_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.rint.f32(float %in) #0 + store float %0, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v2f32: +; R600: RNDNE +; R600: RNDNE + +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}rint_v4f32: +; 
R600: RNDNE +; R600: RNDNE +; R600: RNDNE +; R600: RNDNE + +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +; SI: v_rndne_f32_e32 +define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32: +; R600: RNDNE + +; SI: v_rndne_f32_e32 +define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0 + store float %0, float addrspace(1)* %out + ret void +} + +declare float @llvm.AMDIL.round.nearest.f32(float) #0 +declare float @llvm.rint.f32(float) #0 +declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0 +declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0 + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll new file mode 100644 index 00000000000..3d0f57e3328 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f64: +; SI: s_endpgm +define void @round_f64(double addrspace(1)* %out, double %x) #0 { + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out + ret void +} + +; This is a pretty large function, so just test a few of the +; instructions that are necessary. + +; FUNC-LABEL: {{^}}v_round_f64: +; SI: buffer_load_dwordx2 +; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 + +; SI-DAG: v_not_b32_e32 +; SI-DAG: v_not_b32_e32 + +; SI-DAG: v_cmp_eq_i32 + +; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff +; SI-DAG: v_cmp_gt_i32_e64 +; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] + +; SI-DAG: v_cmp_gt_i32_e64 + + +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep = getelementptr double, double addrspace(1)* %in, i32 %tid + %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid + %x = load double, double addrspace(1)* %gep + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}round_v2f64: +; SI: s_endpgm +define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { + %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f64: +; SI: s_endpgm +define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { + %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f64: +; SI: s_endpgm +define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { + %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +declare double @llvm.round.f64(double) #1 +declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 +declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 +declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { 
nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll new file mode 100644 index 00000000000..f5f124d915a --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.round.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f32: +; SI-DAG: s_load_dword [[SX:s[0-9]+]] +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff +; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] +; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| +; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; SI: buffer_store_dword [[RESULT]] + +; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] +; R600-DAG: ADD {{.*}}, +; R600-DAG: BFI_INT +; R600-DAG: SETGE +; R600-DAG: CNDE +; R600-DAG: ADD +define void @round_f32(float addrspace(1)* %out, float %x) #0 { + %result = call float @llvm.round.f32(float %x) #1 + store float %result, float addrspace(1)* %out + ret void +} + +; The vector tests are really difficult to verify, since it can be hard to +; predict how the scheduler will order the instructions. We already have +; a test for the scalar case, so the vector tests just check that the +; compiler doesn't crash. + +; FUNC-LABEL: {{^}}round_v2f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { + %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { + %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { + %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.round.f32(float) #1 +declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 +declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll new file mode 100644 index 00000000000..3bb245c2e24 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.sin.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI 
-check-prefix=SI-UNSAFE -check-prefix=FUNC %s + +; FUNC-LABEL: sin_f32 +; EG: MULADD_IEEE * +; EG: FRACT * +; EG: ADD * +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG-NOT: SIN +; SI: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 + +define void @sin_f32(float addrspace(1)* %out, float %x) #1 { + %sin = call float @llvm.sin.f32(float %x) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_3x_f32: +; SI-UNSAFE-NOT: v_add_f32 +; SI-UNSAFE: 0x3ef47644 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_mul_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 3.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_2x_f32: +; SI-UNSAFE-NOT: v_add_f32 +; SI-UNSAFE: 0x3ea2f983 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_add_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 2.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_2sin_f32: +; SI-UNSAFE: 0x3ea2f983 +; SI-UNSAFE: v_mul_f32 +; SI-SAFE: v_add_f32 +; SI-SAFE: v_mul_f32 +; SI: v_fract_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 +define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 { + %y = fmul float 2.0, %x + %sin = call float @llvm.sin.f32(float %y) + store float %sin, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sin_v4f32: +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} +; EG-NOT: SIN +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI: v_sin_f32 +; SI-NOT: v_sin_f32 + +define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { + %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) + store <4 x float> %sin, <4 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.sin.f32(float) readnone +declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.ll b/test/CodeGen/AMDGPU/llvm.sqrt.ll new file mode 100644 index 00000000000..c6da047f539 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.sqrt.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI + +; R600-LABEL: {{^}}sqrt_f32: +; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z +; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS +; SI-LABEL: {{^}}sqrt_f32: +; SI: v_sqrt_f32_e32 +define void @sqrt_f32(float addrspace(1)* %out, float %in) { +entry: + %0 = call float @llvm.sqrt.f32(float %in) + store float %0, float addrspace(1)* %out + ret void +} + +; R600-LABEL: {{^}}sqrt_v2f32: +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS +; SI-LABEL: {{^}}sqrt_v2f32: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x 
float> %in) { +entry: + %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; R600-LABEL: {{^}}sqrt_v4f32: +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS +; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X +; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS +; SI-LABEL: {{^}}sqrt_v4f32: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) + store <4 x float> %0, <4 x float> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check: +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check(float addrspace(1)* %out, float %in) { +entry: + %sqrt = call float @llvm.sqrt.f32(float %in) + %cmp = fcmp olt float %in, -0.000000e+00 + %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt + store float %res, float addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_ult: +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) { +entry: + %sqrt = call float @llvm.sqrt.f32(float %in) + %cmp = fcmp ult float %in, -0.000000e+00 + %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt + store float %res, float addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_v2: +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + %cmp = fcmp olt <2 x float> %in, + %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}elim_redun_check_v2_ult +; SI: v_sqrt_f32_e32 +; SI: v_sqrt_f32_e32 +; SI-NOT: v_cndmask +define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) + %cmp = fcmp ult <2 x float> %in, + %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt + store <2 x float> %res, <2 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.sqrt.f32(float %in) +declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) diff --git a/test/CodeGen/AMDGPU/load-i1.ll b/test/CodeGen/AMDGPU/load-i1.ll new file mode 100644 index 00000000000..0ca49fde3e7 --- /dev/null +++ b/test/CodeGen/AMDGPU/load-i1.ll @@ -0,0 +1,149 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}global_copy_i1_to_i1: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: buffer_store_byte +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: AND_INT +define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 
addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + store i1 %load, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}local_copy_i1_to_i1: +; SI: ds_read_u8 +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: ds_write_b8 +; SI: s_endpgm + +; EG: LDS_UBYTE_READ_RET +; EG: AND_INT +; EG: LDS_BYTE_WRITE +define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind { + %load = load i1, i1 addrspace(3)* %in + store i1 %load, i1 addrspace(3)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}constant_copy_i1_to_i1: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 v{{[0-9]+}}, 1 +; SI: buffer_store_byte +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: AND_INT +define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind { + %load = load i1, i1 addrspace(2)* %in + store i1 %load, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}global_sextload_i1_to_i32: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: buffer_store_dword +; SI: s_endpgm + +; EG: VTX_READ_8 +; EG: BFE_INT +define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm + +define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_sextload_i1_to_i64: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i64 + store i64 %ext, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: +; SI: buffer_load_ubyte +; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i64 + store i64 %ext, i64 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg: +; SI: buffer_load_ubyte +; SI: v_and_b32_e32 +; SI: buffer_store_byte +; SI: s_endpgm +define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { + store i1 %x, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_zext_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { + %ext = zext i1 %x to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_zext_i64: +; SI: buffer_load_ubyte +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { + %ext = zext i1 %x to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_sext_i32: +; SI: buffer_load_ubyte +; SI: buffer_store_dword +; SI: s_endpgm +define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { + %ext = sext i1 %x to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i1_arg_sext_i64: +; SI: buffer_load_ubyte +; SI: v_bfe_i32 +; SI: v_ashrrev_i32 +; 
SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { + %ext = sext i1 %x to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll new file mode 100644 index 00000000000..1daf0e6527b --- /dev/null +++ b/test/CodeGen/AMDGPU/load-input-fold.ll @@ -0,0 +1,117 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = extractelement <4 x float> %reg2, i32 1 + %6 = extractelement <4 x float> %reg2, i32 2 + %7 = extractelement <4 x float> %reg2, i32 3 + %8 = extractelement <4 x float> %reg3, i32 0 + %9 = extractelement <4 x float> %reg3, i32 1 + %10 = extractelement <4 x float> %reg3, i32 2 + %11 = extractelement <4 x float> %reg3, i32 3 + %12 = load <4 x float>, <4 x float> addrspace(8)* null + %13 = extractelement <4 x float> %12, i32 0 + %14 = fmul float %0, %13 + %15 = load <4 x float>, <4 x float> addrspace(8)* null + %16 = extractelement <4 x float> %15, i32 1 + %17 = fmul float %0, %16 + %18 = load <4 x float>, <4 x float> addrspace(8)* null + %19 = extractelement <4 x float> %18, i32 2 + %20 = fmul float %0, %19 + %21 = load <4 x float>, <4 x float> addrspace(8)* null + %22 = extractelement <4 x float> %21, i32 3 + %23 = fmul float %0, %22 + %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %25 = extractelement <4 x float> %24, i32 0 + %26 = fmul float %1, %25 + %27 = fadd float %26, %14 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %29 = extractelement <4 x float> %28, i32 1 + %30 = fmul float %1, %29 + %31 = fadd float %30, %17 + %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %33 = extractelement <4 x float> %32, i32 2 + %34 = fmul float %1, %33 + %35 = fadd float %34, %20 + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %37 = extractelement <4 x float> %36, i32 3 + %38 = fmul float %1, %37 + %39 = fadd float %38, %23 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %41 = extractelement <4 x float> %40, i32 0 + %42 = fmul float %2, %41 + %43 = fadd float %42, %27 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %45 = extractelement <4 x float> %44, i32 1 + %46 = fmul float %2, %45 + %47 = fadd float %46, %31 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %49 = extractelement <4 x float> %48, i32 2 + %50 = fmul float %2, %49 + %51 = fadd float %50, %35 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + 
%53 = extractelement <4 x float> %52, i32 3 + %54 = fmul float %2, %53 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %57 = extractelement <4 x float> %56, i32 0 + %58 = fmul float %3, %57 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %61 = extractelement <4 x float> %60, i32 1 + %62 = fmul float %3, %61 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %65 = extractelement <4 x float> %64, i32 2 + %66 = fmul float %3, %65 + %67 = fadd float %66, %51 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %69 = extractelement <4 x float> %68, i32 3 + %70 = fmul float %3, %69 + %71 = fadd float %70, %55 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %73 = extractelement <4 x float> %72, i32 0 + %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %75 = extractelement <4 x float> %74, i32 1 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %77 = extractelement <4 x float> %76, i32 2 + %78 = insertelement <4 x float> undef, float %4, i32 0 + %79 = insertelement <4 x float> %78, float %5, i32 1 + %80 = insertelement <4 x float> %79, float %6, i32 2 + %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3 + %82 = insertelement <4 x float> undef, float %73, i32 0 + %83 = insertelement <4 x float> %82, float %75, i32 1 + %84 = insertelement <4 x float> %83, float %77, i32 2 + %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 + %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85) + %87 = insertelement <4 x float> undef, float %86, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #3 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } +attributes #2 = { readonly } +attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/load.ll b/test/CodeGen/AMDGPU/load.ll new file mode 100644 index 00000000000..93b1b51a0d0 --- /dev/null +++ b/test/CodeGen/AMDGPU/load.ll @@ -0,0 +1,709 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + 
+;===------------------------------------------------------------------------===; +; GLOBAL ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load an i8 value from the global address space. +; FUNC-LABEL: {{^}}load_i8: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} + +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8, i8 addrspace(1)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_sext: +; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 8 +; SI: buffer_load_sbyte +define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = load i8, i8 addrspace(1)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in + %1 = zext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_sext: +; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: 8 +; R600-DAG: 8 + +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in + %1 = sext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8: +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; R600: VTX_READ_8 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_sext: +; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] +; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal +; R600-DAG: 8 +; R600-DAG: 8 +; R600-DAG: 8 +; R600-DAG: 8 +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +; SI: buffer_load_sbyte +define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in + %1 = sext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; Load an i16 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i16: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %0 = load i16 , i16 addrspace(1)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext: +; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 16 +; SI: buffer_load_sshort +define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +entry: + %0 = load i16, i16 addrspace(1)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16: +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; SI: buffer_load_ushort +; SI: buffer_load_ushort +define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in + %1 = zext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_sext: +; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: 16 +; R600-DAG: 16 +; SI: buffer_load_sshort +; SI: buffer_load_sshort +define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in + %1 = sext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16: +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; R600: VTX_READ_16 +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in + %1 = zext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_sext: +; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] +; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] +; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] +; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal +; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal +; R600-DAG: 16 +; R600-DAG: 16 +; R600-DAG: 16 +; R600-DAG: 16 +; SI: buffer_load_sshort +; SI: buffer_load_sshort +; SI: buffer_load_sshort +; SI: buffer_load_sshort +define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in + %1 = sext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; load an i32 value from the global address space. 
+; FUNC-LABEL: {{^}}load_i32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: buffer_load_dword v{{[0-9]+}} +define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; load a f32 value from the global address space. +; FUNC-LABEL: {{^}}load_f32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: buffer_load_dword v{{[0-9]+}} +define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %0 = load float, float addrspace(1)* %in + store float %0, float addrspace(1)* %out + ret void +} + +; load a v2f32 value from the global address space +; FUNC-LABEL: {{^}}load_v2f32: +; R600: MEM_RAT +; R600: VTX_READ_64 +; SI: buffer_load_dwordx2 +define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(1)* %in + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64: +; R600: VTX_READ_64 +; SI: buffer_load_dwordx2 +define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +entry: + %0 = load i64, i64 addrspace(1)* %in + store i64 %0, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64_sext: +; R600: MEM_RAT +; R600: MEM_RAT +; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; R600: 31 +; SI: buffer_load_dword + +define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = sext i32 %0 to i64 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i64_zext: +; R600: MEM_RAT +; R600: MEM_RAT +define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = zext i32 %0 to i64 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v8i32: +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; XXX: We should be using DWORDX4 instructions on SI. +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { +entry: + %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in + store <8 x i32> %0, <8 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v16i32: +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; R600: VTX_READ_128 +; XXX: We should be using DWORDX4 instructions on SI. 
+; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { +entry: + %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in + store <16 x i32> %0, <16 x i32> addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; CONSTANT ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load a sign-extended i8 value +; FUNC-LABEL: {{^}}load_const_i8_sext: +; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 8 +; SI: buffer_load_sbyte v{{[0-9]+}}, +define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = load i8, i8 addrspace(2)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an aligned i8 value +; FUNC-LABEL: {{^}}load_const_i8_aligned: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = load i8, i8 addrspace(2)* %in + %1 = zext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an un-aligned i8 value +; FUNC-LABEL: {{^}}load_const_i8_unaligned: +; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ubyte v{{[0-9]+}}, +define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +entry: + %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 + %1 = load i8, i8 addrspace(2)* %0 + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; Load a sign-extended i16 value +; FUNC-LABEL: {{^}}load_const_i16_sext: +; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] +; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal +; R600: 16 +; SI: buffer_load_sshort +define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = load i16, i16 addrspace(2)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an aligned i16 value +; FUNC-LABEL: {{^}}load_const_i16_aligned: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = load i16, i16 addrspace(2)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Load an un-aligned i16 value +; FUNC-LABEL: {{^}}load_const_i16_unaligned: +; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} +; SI: buffer_load_ushort +define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { +entry: + %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 + %1 = load i16, i16 addrspace(2)* %0 + %2 = zext i16 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; Load an i32 value from the constant address space. 
+; FUNC-LABEL: {{^}}load_const_addrspace_i32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: s_load_dword s{{[0-9]+}} +define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +entry: + %0 = load i32, i32 addrspace(2)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Load a f32 value from the constant address space. +; FUNC-LABEL: {{^}}load_const_addrspace_f32: +; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 + +; SI: s_load_dword s{{[0-9]+}} +define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) { + %1 = load float, float addrspace(2)* %in + store float %1, float addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; LOCAL ADDRESS SPACE +;===------------------------------------------------------------------------===; + +; Load an i8 value from the local address space. +; FUNC-LABEL: {{^}}load_i8_local: +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { + %1 = load i8, i8 addrspace(3)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i8_sext_local: +; R600: LDS_UBYTE_READ_RET +; R600: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { +entry: + %0 = load i8, i8 addrspace(3)* %in + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_local: +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +; SI: ds_read_u8 +define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in + %1 = zext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i8_sext_local: +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +; SI: ds_read_i8 +define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { +entry: + %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in + %1 = sext <2 x i8> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_local: +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; R600: LDS_UBYTE_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in + %1 = zext <4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i8_sext_local: +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: LDS_UBYTE_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i8 +; SI: ds_read_i8 +; SI: ds_read_i8 +; SI: ds_read_i8 +define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { +entry: + %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in + %1 = sext 
<4 x i8> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; Load an i16 value from the local address space. +; FUNC-LABEL: {{^}}load_i16_local: +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { +entry: + %0 = load i16 , i16 addrspace(3)* %in + %1 = zext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_i16_sext_local: +; R600: LDS_USHORT_READ_RET +; R600: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { +entry: + %0 = load i16, i16 addrspace(3)* %in + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_local: +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +; SI: ds_read_u16 +define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in + %1 = zext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v2i16_sext_local: +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +; SI: ds_read_i16 +define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { +entry: + %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in + %1 = sext <2 x i16> %0 to <2 x i32> + store <2 x i32> %1, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_local: +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; R600: LDS_USHORT_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +; SI: ds_read_u16 +define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in + %1 = zext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}load_v4i16_sext_local: +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: LDS_USHORT_READ_RET +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; R600-DAG: BFE_INT +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_i16 +; SI: ds_read_i16 +; SI: ds_read_i16 +; SI: ds_read_i16 +define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { +entry: + %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in + %1 = sext <4 x i16> %0 to <4 x i32> + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; load an i32 value from the local address space. +; FUNC-LABEL: {{^}}load_i32_local: +; R600: LDS_READ_RET +; SI-NOT: s_wqm_b64 +; SI: s_mov_b32 m0 +; SI: ds_read_b32 +define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %0 = load i32, i32 addrspace(3)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; load a f32 value from the local address space. 
+; FUNC-LABEL: {{^}}load_f32_local: +; R600: LDS_READ_RET +; SI: s_mov_b32 m0 +; SI: ds_read_b32 +define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) { +entry: + %0 = load float, float addrspace(3)* %in + store float %0, float addrspace(1)* %out + ret void +} + +; load a v2f32 value from the local address space +; FUNC-LABEL: {{^}}load_v2f32_local: +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; SI: s_mov_b32 m0 +; SI: ds_read_b64 +define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(3)* %in + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + +; Test loading a i32 and v2i32 value from the same base pointer. +; FUNC-LABEL: {{^}}load_i32_v2i32_local: +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; R600: LDS_READ_RET +; SI-DAG: ds_read_b32 +; SI-DAG: ds_read2_b32 +define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { + %scalar = load i32, i32 addrspace(3)* %in + %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* + %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 + %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4 + %vec1 = insertelement <2 x i32> , i32 %scalar, i32 0 + %vec = add <2 x i32> %vec0, %vec1 + store <2 x i32> %vec, <2 x i32> addrspace(1)* %out + ret void +} + + +@lds = addrspace(3) global [512 x i32] undef, align 4 + +; On SI we need to make sure that the base offset is a register and not +; an immediate. +; FUNC-LABEL: {{^}}load_i32_local_const_ptr: +; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 +; SI: ds_read_b32 v0, v[[ZERO]] offset:4 +; R600: LDS_READ_RET +define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { +entry: + %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 + %tmp1 = load i32, i32 addrspace(3)* %tmp0 + %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 %tmp1, i32 addrspace(1)* %tmp2 + ret void +} diff --git a/test/CodeGen/AMDGPU/load.vec.ll b/test/CodeGen/AMDGPU/load.vec.ll new file mode 100644 index 00000000000..02f883cd8e9 --- /dev/null +++ b/test/CodeGen/AMDGPU/load.vec.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +; load a v2i32 value from the global address space. +; EG: {{^}}load_v2i32: +; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0 +; SI: {{^}}load_v2i32: +; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + store <2 x i32> %a, <2 x i32> addrspace(1)* %out + ret void +} + +; load a v4i32 value from the global address space. 
+; EG: {{^}}load_v4i32: +; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0 +; SI: {{^}}load_v4i32: +; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}] +define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + store <4 x i32> %a, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/load64.ll b/test/CodeGen/AMDGPU/load64.ll new file mode 100644 index 00000000000..74beabdc007 --- /dev/null +++ b/test/CodeGen/AMDGPU/load64.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; load a f64 value from the global address space. +; CHECK-LABEL: {{^}}load_f64: +; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) { + %1 = load double, double addrspace(1)* %in + store double %1, double addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}load_i64: +; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tmp = load i64, i64 addrspace(1)* %in + store i64 %tmp, i64 addrspace(1)* %out, align 8 + ret void +} + +; Load a f64 value from the constant address space. +; CHECK-LABEL: {{^}}load_const_addrspace_f64: +; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] +; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] +define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) { + %1 = load double, double addrspace(2)* %in + store double %1, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll new file mode 100644 index 00000000000..33f3159d13e --- /dev/null +++ b/test/CodeGen/AMDGPU/local-64.ll @@ -0,0 +1,167 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}local_i32_load +; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 +; BOTH: buffer_store_dword [[REG]], +define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 + %val = load i32, i32 addrspace(3)* %gep, align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i32_load_0_offset +; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} +; BOTH: buffer_store_dword [[REG]], +define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { + %val = load i32, i32 addrspace(3)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: +; BOTH-NOT: ADD +; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 +; BOTH: buffer_store_byte [[REG]], +define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { + %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 + %val = load i8, i8 addrspace(3)* %gep, align 4 + store i8 %val, i8 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: 
{{^}}local_i8_load_over_i16_max_offset: +; The LDS offset will be 65536 bytes, which is larger than the size of LDS on +; SI, which is why it is being OR'd with the base pointer. +; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 +; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] +; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] +; BOTH: buffer_store_byte [[REG]], +define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { + %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 + %val = load i8, i8 addrspace(3)* %gep, align 4 + store i8 %val, i8 addrspace(1)* %out, align 4 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_load: +; BOTH-NOT: ADD +; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 + %val = load i64, i64 addrspace(3)* %gep, align 8 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_load_0_offset +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { + %val = load i64, i64 addrspace(3)* %in, align 8 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_load: +; BOTH-NOT: ADD +; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { + %gep = getelementptr double, double addrspace(3)* %in, i32 7 + %val = load double, double addrspace(3)* %gep, align 8 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_load_0_offset +; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} +; BOTH: buffer_store_dwordx2 [[REG]], +define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { + %val = load double, double addrspace(3)* %in, align 8 + store double %val, double addrspace(1)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_store: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +define void @local_i64_store(i64 addrspace(3)* %out) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 + store i64 5678, i64 addrspace(3)* %gep, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_i64_store_0_offset: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { + store i64 1234, i64 addrspace(3)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_store: +; BOTH-NOT: ADD +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 +define void @local_f64_store(double addrspace(3)* %out) nounwind { + %gep = getelementptr double, double addrspace(3)* %out, i32 7 + store double 16.0, double addrspace(3)* %gep, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_f64_store_0_offset +; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} +define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { + store double 20.0, double addrspace(3)* %out, align 8 + ret void +} + +; BOTH-LABEL: {{^}}local_v2i64_store: +; BOTH-NOT: ADD +; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, 
{{v\[[0-9]+:[0-9]+\]}} offset:112
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: s_endpgm
+define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
+ store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: s_endpgm
+define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+ store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH: s_endpgm
+define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+ %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
+ store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
+ ret void
+}
+
+; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH: s_endpgm
+define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+ store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
new file mode 100644
index 00000000000..2aaf977ab90
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -0,0 +1,551 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
+; EG: LDS_WRXCHG_RET *
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; EG: LDS_WRXCHG_RET *
+; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+
ret void +} + +; XXX - Is it really necessary to load 4 into VGPR? +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: +; EG: LDS_ADD_RET * +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: +; EG: LDS_ADD_RET * +; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: +; EG: LDS_ADD_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset: +; EG: LDS_ADD_RET * +; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: +; EG: LDS_SUB_RET * +; GCN: ds_sub_rtn_u32 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}lds_atomic_sub_ret_i32_offset: +; EG: LDS_SUB_RET * +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32: +; EG: LDS_SUB_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: +; EG: LDS_SUB_RET * +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: +; EG: LDS_AND_RET * +; GCN: ds_and_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: +; EG: LDS_AND_RET * +; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: +; EG: LDS_OR_RET * +; GCN: ds_or_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: +; EG: LDS_OR_RET * +; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: +; EG: LDS_XOR_RET * +; GCN: ds_xor_rtn_b32 +; GCN: s_endpgm +define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: +; EG: LDS_XOR_RET * +; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 
addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:
+; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+; store i32 %result, i32 addrspace(1)* %out, align 4
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; EG: LDS_MIN_INT_RET *
+; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; EG: LDS_MAX_INT_RET *
+; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; EG: LDS_MIN_UINT_RET *
+; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; EG: LDS_MAX_UINT_RET *
+; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+ %gep =
getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: +; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; XXX - Is it really necessary to load 4 into VGPR? +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: +; GCN: s_load_dword [[SPTR:s[0-9]+]], +; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] +; GCN: ds_add_u32 [[VPTR]], [[DATA]] +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: +; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset +; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32: +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: +; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 +; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset: +; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { + %sub = sub i32 %a, %b + %add = add i32 %sub, 4 + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add + %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: +; GCN: ds_sub_u32 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw 
sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]]
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; GCN: s_endpgm
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; GCN: ds_and_b32
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; GCN: ds_or_b32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; GCN: ds_xor_b32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
+; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst +; ret void +; } + +; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: +; GCN: ds_min_i32 +; GCN: s_endpgm +define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: +; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: +; GCN: ds_max_i32 +; GCN: s_endpgm +define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: +; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: +; GCN: ds_min_u32 +; GCN: s_endpgm +define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: +; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: +; GCN: ds_max_u32 +; GCN: s_endpgm +define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: +; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 +; GCN: s_endpgm +define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst + ret void +} diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll new file mode 100644 index 00000000000..0ffa5e751b7 --- /dev/null +++ b/test/CodeGen/AMDGPU/local-atomics64.ll @@ -0,0 +1,470 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s + +; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64: +; GCN: ds_wrxchg_rtn_b64 +; GCN: s_endpgm +define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: +; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 
addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64: +; GCN: ds_add_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: +; GCN: ds_inc_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64: +; GCN: ds_sub_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: +; GCN: ds_sub_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: buffer_store_dwordx2 [[RESULT]], +; GCN: s_endpgm +define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: +; GCN: ds_dec_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void 
@lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; GCN: ds_and_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; GCN: ds_and_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; GCN: ds_or_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; GCN: ds_or_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; GCN: ds_xor_rtn_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; GCN: ds_xor_rtn_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ store i64 %result, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:
+; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64: +; GCN: ds_min_rtn_i64 +; GCN: s_endpgm +define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset: +; GCN: ds_min_rtn_i64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64: +; GCN: ds_max_rtn_i64 +; GCN: s_endpgm +define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset: +; GCN: ds_max_rtn_i64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64: +; GCN: ds_min_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: +; GCN: ds_min_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64: +; GCN: ds_max_rtn_u64 +; GCN: s_endpgm +define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: +; GCN: ds_max_rtn_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64: +; GCN: ds_wrxchg_rtn_b64 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: +; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + 
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64: +; GCN: ds_add_u64 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: +; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 +; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] +; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: +; GCN: ds_inc_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64: +; GCN: ds_sub_u64 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: +; GCN: ds_sub_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64: +; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 +; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 +; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} +; GCN: s_endpgm +define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: +; GCN: ds_dec_u64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64: +; GCN: ds_and_b64 +; GCN: s_endpgm +define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { + %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset: +; GCN: ds_and_b64 {{.*}} offset:32 +; GCN: s_endpgm +define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst + ret void +} + +; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i64:
+; GCN: ds_or_b64
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; GCN: ds_or_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; GCN: ds_xor_b64
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; GCN: ds_xor_b64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
+; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
+; ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; GCN: ds_min_i64
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; GCN: ds_min_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; GCN: ds_max_i64
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; GCN: ds_max_i64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; GCN: ds_min_u64
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; GCN: ds_min_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+ %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; GCN: ds_max_u64
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+ %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+ ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; GCN: ds_max_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+ %gep = getelementptr i64, i64 addrspace(3)*
%ptr, i32 4
+ %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
new file mode 100644
index 00000000000..06a8b1246e6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+
+
+; Check that the LDS size emitted correctly
+; EG: .long 166120
+; EG-NEXT: .long 8
+; GCN: .long 47180
+; GCN-NEXT: .long 38792
+
+; EG: {{^}}local_memory_two_objects:
+
+; We would like to check that the lds writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG: LDS_WRITE
+; EG: LDS_WRITE
+; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
+
+; GROUP_BARRIER must be the last instruction in a clause
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses, at different
+; constant offsets.
+; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+entry:
+ %x.i = call i32 @llvm.r600.read.tidig.x() #0
+ %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+ store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+ %mul = shl nsw i32 %x.i, 1
+ %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+ store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+ %sub = sub nsw i32 3, %x.i
+ call void @llvm.AMDGPU.barrier.local()
+ %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+ %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
+ store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+ %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
+ %add = add nsw i32 %x.i, 4
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
+ store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
new file mode 100644
index 00000000000..9494ed75bd0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc
-march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s + +@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 + + +; Check that the LDS size emitted correctly +; EG: .long 166120 +; EG-NEXT: .long 128 +; SI: .long 47180 +; SI-NEXT: .long 71560 +; CI: .long 47180 +; CI-NEXT: .long 38792 + +; FUNC-LABEL: {{^}}local_memory: + +; EG: LDS_WRITE +; SI-NOT: s_wqm_b64 +; SI: ds_write_b32 + +; GROUP_BARRIER must be the last instruction in a clause +; EG: GROUP_BARRIER +; EG-NEXT: ALU clause +; SI: s_barrier + +; EG: LDS_READ_RET +; SI: ds_read_b32 {{v[0-9]+}}, + +define void @local_memory(i32 addrspace(1)* %out) { +entry: + %y.i = call i32 @llvm.r600.read.tidig.x() #0 + %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i + store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 + %add = add nsw i32 %y.i, 1 + %cmp = icmp eq i32 %add, 16 + %.add = select i1 %cmp, i32 0, i32 %add + call void @llvm.AMDGPU.barrier.local() + %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add + %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/loop-address.ll b/test/CodeGen/AMDGPU/loop-address.ll new file mode 100644 index 00000000000..f60d574497d --- /dev/null +++ b/test/CodeGen/AMDGPU/loop-address.ll @@ -0,0 +1,34 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood < %s | FileCheck %s + +;CHECK: ALU_PUSH +;CHECK: LOOP_START_DX10 @11 +;CHECK: LOOP_BREAK @10 +;CHECK: POP @10 + +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } + +!opencl.kernels = !{!0, !1, !2, !3} + +!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge} +!1 = !{null} +!2 = !{null} +!3 = !{null} diff --git a/test/CodeGen/AMDGPU/loop-idiom.ll b/test/CodeGen/AMDGPU/loop-idiom.ll new file mode 100644 index 00000000000..5fd9806813c --- /dev/null +++ b/test/CodeGen/AMDGPU/loop-idiom.ll @@ -0,0 +1,51 @@ +; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s + + +; Make sure loop-idiom 
doesn't create memcpy or memset. There are no library +; implementations of these for R600. + +; FUNC: @no_memcpy +; R600-NOT: {{^}}llvm.memcpy +; SI-NOT: {{^}}llvm.memcpy +define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { +entry: + %dest = alloca i8, i32 32 + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%4, %for.body] + %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0 + %2 = getelementptr i8, i8* %dest, i32 %0 + %3 = load i8, i8 addrspace(3)* %1 + store i8 %3, i8* %2 + %4 = add i32 %0, 1 + %5 = icmp eq i32 %4, %size + br i1 %5, label %for.end, label %for.body + +for.end: + ret void +} + +; FUNC: @no_memset +; R600-NOT: {{^}}llvm.memset +; R600-NOT: {{^}}memset_pattern16: +; SI-NOT: {{^}}llvm.memset +; SI-NOT: {{^}}memset_pattern16: +define void @no_memset(i32 %size) { +entry: + %dest = alloca i8, i32 32 + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%2, %for.body] + %1 = getelementptr i8, i8* %dest, i32 %0 + store i8 0, i8* %1 + %2 = add i32 %0, 1 + %3 = icmp eq i32 %2, %size + br i1 %3, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll new file mode 100644 index 00000000000..9ac988d38d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/lshl.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 + +define void @test(i32 %p) { + %i = mul i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll new file mode 100644 index 00000000000..50e444ac26b --- /dev/null +++ b/test/CodeGen/AMDGPU/lshr.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 + +define void @test(i32 %p) { + %i = udiv i32 %p, 2 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/m0-spill.ll b/test/CodeGen/AMDGPU/m0-spill.ll new file mode 100644 index 00000000000..1dddc85f775 --- /dev/null +++ b/test/CodeGen/AMDGPU/m0-spill.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +@lds = external addrspace(3) global [64 x float] + +; CHECK-LABEL: {{^}}main: +; CHECK-NOT: v_readlane_b32 m0 +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +main_body: + %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %cmp = fcmp ueq float 0.0, %4 + br i1 %cmp, label %if, label %else + +if: + %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 + %lds_data = load float, float addrspace(3)* %lds_ptr 
+ br label %endif + +else: + %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + br label %endif + +endif: + %export = phi float [%lds_data, %if], [%interp, %else] + %5 = call i32 @llvm.SI.packf16(float %export, float %export) + %6 = bitcast i32 %5 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6) + ret void +} + +declare float @llvm.SI.fs.constant(i32, i32, i32) readnone + +declare i32 @llvm.SI.packf16(float, float) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll new file mode 100644 index 00000000000..bc071628ead --- /dev/null +++ b/test/CodeGen/AMDGPU/mad-combine.ll @@ -0,0 +1,567 @@ +; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. + +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s + +; Make sure we don't form mad with denormals +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.fabs.f32(float) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare float @llvm.fmuladd.f32(float, float, float) #0 + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF-NOT: v_fma +; SI-DENORM-SLOWFMAF-NOT: v_mad + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fadd float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fadd (fmul x, y), z) -> (fma x, y, z) +; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: +; 
SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = fadd float %mul, %c + %fma1 = fadd float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fadd x, (fmul y, z)) -> (fma y, z, x) +; FUNC-LABEL: {{^}}combine_to_mad_f32_1: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fadd float %c, %mul + store float %fma, float addrspace(1)* %gep.out + 
ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %mul, %c + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %fma0 = 
fsub float %mul, %c + %fma1 = fsub float %mul, %d + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %fma = fsub float %c, %mul + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + 
+ %mul = fmul float %a, %b + %fma0 = fsub float %c, %mul + %fma1 = fsub float %d, %mul + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] + +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] + +; SI: buffer_store_dword [[RESULT]] +define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma = fsub float %mul.neg, %c + + store float %fma, float addrspace(1)* %gep.out + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, 
float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma0 = fsub float %mul.neg, %c + %fma1 = fsub float %mul.neg, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) +; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} + +; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] +; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] +; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] + +; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI: s_endpgm +define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + %d = load float, float addrspace(1)* %gep.3 + + %mul = fmul float %a, %b + %mul.neg = fsub float -0.0, %mul + %fma0 = fsub float %mul.neg, %c + %fma1 = fsub float %mul, %d + + store float %fma0, float addrspace(1)* %gep.out.0 + store float %fma1, float addrspace(1)* %gep.out.1 + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] +; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] + +; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]] +; 
SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 + %tmp2 = fsub float %tmp1, %z + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fma y, z, (fmul u, v))) +; -> (fma (fneg y), z, (fma (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] +; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] + +; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float 
addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] +; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 + %tmp2 = fsub float %tmp1, %z + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +; fold (fsub x, (fmuladd y, z, (fmul u, v))) +; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) + +; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} + +; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] +; SI-DENORM: v_fma_f32 
[[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] + +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] +; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] +; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] + +; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI: s_endpgm +define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 + %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %x = load float, float addrspace(1)* %gep.0 + %y = load float, float addrspace(1)* %gep.1 + %z = load float, float addrspace(1)* %gep.2 + %u = load float, float addrspace(1)* %gep.3 + %v = load float, float addrspace(1)* %gep.4 + + %tmp0 = fmul float %u, %v + %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub float %x, %tmp1 + + store float %tmp2, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll new file mode 100644 index 00000000000..aa4194ff610 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad-sub.ll @@ -0,0 +1,215 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() #0 +declare float @llvm.fabs.f32(float) #0 + +; FUNC-LABEL: {{^}}mad_sub_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_inv_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float 
addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %mul = fmul float %a, %b + %sub = fsub float %c, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_f64: +; SI: v_mul_f64 +; SI: v_add_f64 +define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext + %a = load double, double addrspace(1)* %gep0, align 8 + %b = load double, double addrspace(1)* %gep1, align 8 + %c = load double, double addrspace(1)* %gep2, align 8 + %mul = fmul double %a, %b + %sub = fsub double %mul, %c + store double %sub, double addrspace(1)* %outgep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_fabs_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %mul, %c.abs + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| +; SI: buffer_store_dword [[RESULT]] +define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float 
addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %c.abs = call float @llvm.fabs.f32(float %c) #0 + %mul = fmul float %a, %b + %sub = fsub float %c.abs, %mul + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}neg_neg_mad_f32: +; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %nega = fsub float -0.000000e+00, %a + %negb = fsub float -0.000000e+00, %b + %mul = fmul float %nega, %negb + %sub = fadd float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}mad_fabs_sub_f32: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] +; SI: buffer_store_dword [[RESULT]] +define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { + %tid = tail call i32 @llvm.r600.read.tidig.x() #0 + %tid.ext = sext i32 %tid to i64 + %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext + %add1 = add i64 %tid.ext, 1 + %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 + %add2 = add i64 %tid.ext, 2 + %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 + %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext + %a = load float, float addrspace(1)* %gep0, align 4 + %b = load float, float addrspace(1)* %gep1, align 4 + %c = load float, float addrspace(1)* %gep2, align 4 + %b.abs = call float @llvm.fabs.f32(float %b) #0 + %mul = fmul float %a, %b.abs + %sub = fsub float %mul, %c + store float %sub, float addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] +; SI: buffer_store_dword [[RESULT]] +define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %r2, %add + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}fsub_fadd_a_a_c: +; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] +; SI: buffer_store_dword [[RESULT]] +define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load float, float addrspace(1)* %gep.0 + %r2 = load float, float addrspace(1)* %gep.1 + + %add = fadd float %r1, %r1 + %r3 = fsub float %add, %r2 + + store float %r3, float addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll new file mode 100644 index 00000000000..86d75a63ca4 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad_int24.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone + +; FUNC-LABEL: {{^}}i32_mad24: +; Signed 24-bit multiply is not supported on pre-Cayman GPUs. +; EG: MULLO_INT +; Make sure we aren't masking the inputs. +; CM-NOT: AND +; CM: MULADD_INT24 +; SI-NOT: and +; SI: v_mad_i32_i24 +define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = shl i32 %a, 8 + %a_24 = ashr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = ashr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + %3 = add i32 %2, %c + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @test_imul24 +; SI: v_mad_i32_i24 +define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { + %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone + %add = add i32 %mul, %src2 + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll new file mode 100644 index 00000000000..95fe3411959 --- /dev/null +++ b/test/CodeGen/AMDGPU/mad_uint24.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}u32_mad24: +; EG: MULADD_UINT24 +; SI: v_mad_u32_u24 + +define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + %3 = add i32 %2, %c + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i16_mad24: +; The order of A and B does not matter. 
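+; (The EG and SI patterns below therefore leave the source operands generic
+; rather than pinning them to particular registers.)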
+; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 16 +; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 + +define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +entry: + %0 = mul i16 %a, %b + %1 = add i16 %0, %c + %2 = sext i16 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mad24: +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + +define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +entry: + %0 = mul i8 %a, %b + %1 = add i8 %0, %c + %2 = sext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; This tests for a bug where the mad_u24 pattern matcher would call +; SimplifyDemandedBits on the first operand of the mul instruction +; assuming that the pattern would be matched to a 24-bit mad. This +; led to some instructions being incorrectly erased when the entire +; 24-bit mad pattern wasn't being matched. + +; Check that the select instruction is not deleted. +; FUNC-LABEL: {{^}}i24_i32_i32_mad: +; EG: CNDE_INT +; SI: v_cndmask +define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %0 = ashr i32 %a, 8 + %1 = icmp ne i32 %c, 0 + %2 = select i1 %1, i32 %0, i32 34 + %3 = mul i32 %2, %c + %4 = add i32 %3, %d + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll new file mode 100644 index 00000000000..933bb016d2c --- /dev/null +++ b/test/CodeGen/AMDGPU/madak.ll @@ -0,0 +1,193 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; FIXME: Enable VI + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; GCN-LABEL: {{^}}madak_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 +define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure this is only folded with one use. This is a code size +; optimization and if we fold the immediate multiple times, we'll undo +; it. 
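+; With two users, the expected code below materializes 10.0 (0x41200000) once
+; into [[VK]] with v_mov_b32 and feeds it to two plain v_mad_f32 instructions,
+; instead of repeating the 32-bit literal in two v_madak_f32 encodings.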
+ +; GCN-LABEL: {{^}}madak_2_use_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] +; GCN: s_endpgm +define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2 + + %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + + %a = load float, float addrspace(1)* %in.gep.0, align 4 + %b = load float, float addrspace(1)* %in.gep.1, align 4 + %c = load float, float addrspace(1)* %in.gep.2, align 4 + + %mul0 = fmul float %a, %b + %mul1 = fmul float %a, %c + %madak0 = fadd float %mul0, 10.0 + %madak1 = fadd float %mul1, 10.0 + + store float %madak0, float addrspace(1)* %out.gep.0, align 4 + store float %madak1, float addrspace(1)* %out.gep.1, align 4 + ret void +} + +; GCN-LABEL: {{^}}madak_m_inline_imm_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + + %mul = fmul float 4.0, %a + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; Make sure nothing weird happens with a value that is also allowed as +; an inline immediate. 
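+; The constant here is 4.0, which already has an inline-immediate encoding, so
+; the checks below expect an ordinary v_mad_f32 taking 4.0 directly rather than
+; a v_madak_f32 carrying a 32-bit literal.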
+ +; GCN-LABEL: {{^}}madak_inline_imm_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 4.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; We can't use an SGPR when forming madak +; GCN-LABEL: {{^}}s_v_madak_f32: +; GCN: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] +define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: @v_s_madak_f32 +; GCN-DAG: s_load_dword [[SB:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] +define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_s_madak_f32: +; GCN-NOT: v_madak_f32 +; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %mul = fmul float %a, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +; GCN: s_endpgm +define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 
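+  ; The fabs below puts a source modifier on src0; per the label and the check
+  ; above, this is expected to block the madak form and fall back to v_mad_f32.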
+ + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %mul = fmul float %a.fabs, %b + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: +; GCN: buffer_load_dword [[VA:v[0-9]+]] +; GCN: buffer_load_dword [[VB:v[0-9]+]] +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} +; GCN: s_endpgm +define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid + %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %in.a.gep, align 4 + %b = load float, float addrspace(1)* %in.b.gep, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %mul = fmul float %a, %b.fabs + %madak = fadd float %mul, 10.0 + store float %madak, float addrspace(1)* %out.gep, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll new file mode 100644 index 00000000000..ba7bb221a99 --- /dev/null +++ b/test/CodeGen/AMDGPU/madmk.ll @@ -0,0 +1,205 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.fabs.f32(float) nounwind readnone + +; GCN-LABEL: {{^}}madmk_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}madmk_2_use_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] +; GCN: s_endpgm +define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + + %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + %in.gep.2 = getelementptr 
float, float addrspace(1)* %in.gep.0, i32 2 + + %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 + + %a = load float, float addrspace(1)* %in.gep.0, align 4 + %b = load float, float addrspace(1)* %in.gep.1, align 4 + %c = load float, float addrspace(1)* %in.gep.2, align 4 + + %mul0 = fmul float %a, 10.0 + %mul1 = fmul float %a, 10.0 + %madmk0 = fadd float %mul0, %b + %madmk1 = fadd float %mul1, %c + + store float %madmk0, float addrspace(1)* %out.gep.0, align 4 + store float %madmk1, float addrspace(1)* %out.gep.1, align 4 + ret void +} + +; We don't get any benefit if the constant is an inline immediate. +; GCN-LABEL: {{^}}madmk_inline_imm_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] +define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %mul = fmul float %a, 4.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}s_s_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_s_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %a = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}scalar_vector_madmk_f32: +; GCN-NOT: v_madmk_f32 +; GCN: v_mad_f32 +; GCN: s_endpgm +define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %b = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone + + %mul = fmul float %a.fabs, 10.0 + %madmk = fadd float %mul, %b + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: +; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| +define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + %b = load float, float addrspace(1)* %gep.1, align 4 + + %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, %b.fabs + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: +; GCN: buffer_load_dword [[A:v[0-9]+]] +; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 +; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 +define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + + %a = load float, float addrspace(1)* %gep.0, align 4 + + %mul = fmul float %a, 10.0 + %madmk = fadd float %mul, 2.0 + store float %madmk, float addrspace(1)* %out.gep, align 4 + ret void +} + +; SI-LABEL: {{^}}kill_madmk_verifier_error: +; SI: s_xor_b64 +; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c +; SI: s_or_b64 +define void @kill_madmk_verifier_error() nounwind { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb6, %bb + %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] + %tmp3 = fsub float undef, %tmp + %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 + br i1 %tmp5, label %bb1, label %bb6 + +bb6: ; preds = %bb2 + %tmp4 = fmul float %tmp, undef + %tmp7 = fmul float %tmp4, 0x40E55DD180000000 + %tmp8 = fadd float %tmp7, undef + br label %bb2 +} diff --git a/test/CodeGen/AMDGPU/max-literals.ll b/test/CodeGen/AMDGPU/max-literals.ll new file mode 100644 index 00000000000..c357524b140 --- /dev/null +++ b/test/CodeGen/AMDGPU/max-literals.ll @@ -0,0 +1,67 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: ADD * + +define void @main(<4 x 
float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = fadd float %0, 2.0 + %6 = fadd float %1, 3.0 + %7 = fadd float %2, 4.0 + %8 = fadd float %3, 5.0 + %9 = bitcast float %4 to i32 + %10 = mul i32 %9, 6 + %11 = bitcast i32 %10 to float + %12 = insertelement <4 x float> undef, float %5, i32 0 + %13 = insertelement <4 x float> %12, float %6, i32 1 + %14 = insertelement <4 x float> %13, float %7, i32 2 + %15 = insertelement <4 x float> %14, float %8, i32 3 + %16 = insertelement <4 x float> %15, float %11, i32 3 + + %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) + %18 = insertelement <4 x float> undef, float %17, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) + ret void +} + +; CHECK-LABEL: {{^}}main2: +; CHECK-NOT: ADD * + +define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = fadd float %0, 2.0 + %6 = fadd float %1, 3.0 + %7 = fadd float %2, 4.0 + %8 = fadd float %3, 2.0 + %9 = bitcast float %4 to i32 + %10 = mul i32 %9, 6 + %11 = bitcast i32 %10 to float + %12 = insertelement <4 x float> undef, float %5, i32 0 + %13 = insertelement <4 x float> %12, float %6, i32 1 + %14 = insertelement <4 x float> %13, float %7, i32 2 + %15 = insertelement <4 x float> %14, float %8, i32 3 + %16 = insertelement <4 x float> %15, float %11, i32 3 + + %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) + %18 = insertelement <4 x float> undef, float %17, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll new file mode 100644 index 00000000000..fef3e2f0a21 --- /dev/null +++ b/test/CodeGen/AMDGPU/max.ll @@ -0,0 +1,168 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax_sge_i32 +; SI: v_max_i32_e32 +define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imax_sge_i32 +; SI: s_max_i32 +define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* 
%out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sge i32 %a, 9 + %val = select i1 %cmp, i32 %a, i32 9 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: +; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 +define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sgt i32 %a, 9 + %val = select i1 %cmp, i32 %a, i32 9 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imax_sgt_i32 +; SI: v_max_i32_e32 +define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sgt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imax_sgt_i32 +; SI: s_max_i32 +define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sgt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax_uge_i32 +; SI: v_max_u32_e32 +define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp uge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umax_uge_i32 +; SI: s_max_u32 +define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp uge i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax_ugt_i32 +; SI: v_max_u32_e32 +define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ugt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umax_ugt_i32 +; SI: s_max_u32 +define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp ugt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; Make sure redundant and removed +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: +; 
SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { + %a.ext = zext i16 %a to i32 + %b.ext = zext i16 %b to i32 + %cmp = icmp ugt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} + +; Make sure redundant sign_extend_inreg removed. + +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { + %a.ext = sext i16 %a to i32 + %b.ext = sext i16 %b to i32 + %cmp = icmp sgt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %shl = shl i32 %val, 16 + %sextinreg = ashr i32 %shl, 16 + store i32 %sextinreg, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should get match min/max through extends inserted by +; legalization. + +; FUNC-LABEL: {{^}}s_test_imin_sge_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_ge_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sge i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll new file mode 100644 index 00000000000..cfb94b272e5 --- /dev/null +++ b/test/CodeGen/AMDGPU/max3.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax3_sgt_i32 +; SI: v_max3_i32 +define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp sgt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp sgt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax3_ugt_i32 +; SI: v_max3_u32 +define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + 
%a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ugt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ugt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll new file mode 100644 index 00000000000..dbf9d4481ff --- /dev/null +++ b/test/CodeGen/AMDGPU/merge-stores.ll @@ -0,0 +1,536 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s + +; Run with devices with different unaligned load restrictions. + +; TODO: Vector element tests +; TODO: Non-zero base offset for load and store combinations +; TODO: Same base addrspacecasted + + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 + + store i8 123, i8 addrspace(1)* %out.gep.1 + store i8 456, i8 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: +; GCN: buffer_store_dword v +define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: +; GCN: buffer_store_dword v +define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 0, i16 addrspace(1)* %out.gep.1 + store i16 0, i16 addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 + + store i16 123, i16 addrspace(1)* %out.gep.1 + store i16 456, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: +; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 +; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b +; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] +; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = 
getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* + store float 1.0, float addrspace(1)* %out.gep.1.bc + store i32 456, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* + store i32 123, i32 addrspace(1)* %out.gep.1.bc + store float 4.0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: +; GCN: buffer_store_dwordx4 +define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 333, i32 addrspace(1)* %out.gep.3 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dwordx2 v +define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 8.0, float addrspace(1)* %out + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + ret void +} + +; First store is out of order. Because of order of combines, the +; consecutive store fails because only some of the stores have been +; replaced with integer constant stores, and then won't merge because +; the types are different. 
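+; (Illustrative sketch added by the editor, not part of the original test; pointer
+; names are made up.) The intermediate state the comment above describes looks
+; roughly like this after the partial combine:
+;   store i32 1065353216, i32 addrspace(1)* %p1   ; float 1.0 already rewritten to i32
+;   store i32 1073741824, i32 addrspace(1)* %p2   ; float 2.0 already rewritten to i32
+;   store float 4.0, float addrspace(1)* %p3      ; not yet rewritten
+;   store float 8.0, float addrspace(1)* %p0      ; the out-of-order store
+; The mix of i32 and float stored types is what blocks the dwordx4 merge.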
+ +; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + + store float 1.0, float addrspace(1)* %out.gep.1 + store float 2.0, float addrspace(1)* %out.gep.2 + store float 4.0, float addrspace(1)* %out.gep.3 + store float 8.0, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword +; SI-NOT: buffer_store_dword +; GCN: s_endpgm +define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out.gep.2 + store i32 1234, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: +; XGCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: +; XGCN: buffer_store_dwordx4 +; XGCN: buffer_store_dwordx4 + +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +; GCN: buffer_store_dwordx2 +define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { + %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 + %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 + %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 + + store i64 123, i64 addrspace(1)* %out.gep.1 + store i64 456, i64 addrspace(1)* %out.gep.2 + store i64 333, i64 addrspace(1)* %out.gep.3 + store i64 1234, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[LOAD]] +define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: +; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %lo = load i32, i32 addrspace(1)* %in.gep.0 + %hi = 
load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %lo, i32 addrspace(1)* %out.gep.0 + store i32 %hi, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + + %lo = load i32, i32 addrspace(1)* %in + %hi = load i32, i32 addrspace(1)* %in.gep.1 + + store i32 %hi, i32 addrspace(1)* %out + store i32 %lo, i32 addrspace(1)* %out.gep.1 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: +; SI-DAG: buffer_load_dwordx2 +; SI-DAG: buffer_load_dword v +; GCN: s_waitcnt +; SI-DAG: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 v +; GCN: s_endpgm +define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 + + %x = load float, float addrspace(1)* %in + %y = load float, float addrspace(1)* %in.gep.1 + %z = load float, float addrspace(1)* %in.gep.2 + %w = load float, float addrspace(1)* %in.gep.3 + + store float %x, float addrspace(1)* %out + store float %y, float 
addrspace(1)* %out.gep.1 + store float %z, float addrspace(1)* %out.gep.2 + store float %w, float addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 +define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 + + %x = load i32, i32 addrspace(1)* %in.gep.0 + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + store i32 %x, i32 addrspace(1)* %out.gep.0 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: +; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] +; GCN: s_barrier +; GCN: buffer_store_dwordx4 [[LOAD]] +define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.AMDGPU.barrier.local() #1 + + store i32 %w, i32 addrspace(1)* %out.gep.3 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %x, i32 addrspace(1)* %out + + ret void +} + +; TODO: Re-packing of loaded register required. Maybe an IR pass +; should catch this? 
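+; (Illustrative sketch added by the editor, not part of the original test; register
+; numbers are made up.) "Re-packing" here means that if the four loads below were
+; merged into one buffer_load_dwordx4, the reversed stores would still need the
+; lanes shuffled into a contiguous register tuple before a single
+; buffer_store_dwordx4, roughly:
+;   buffer_load_dwordx4 v[0:3], ...
+;   v_mov_b32_e32 v7, v0
+;   v_mov_b32_e32 v6, v1
+;   v_mov_b32_e32 v5, v2
+;   v_mov_b32_e32 v4, v3
+;   buffer_store_dwordx4 v[4:7], ...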
+ +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: s_barrier +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 + %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 + + %x = load i32, i32 addrspace(1)* %in + %y = load i32, i32 addrspace(1)* %in.gep.1 + %z = load i32, i32 addrspace(1)* %in.gep.2 + %w = load i32, i32 addrspace(1)* %in.gep.3 + + ; Make sure the barrier doesn't stop this + tail call void @llvm.AMDGPU.barrier.local() #1 + + store i32 %w, i32 addrspace(1)* %out + store i32 %z, i32 addrspace(1)* %out.gep.1 + store i32 %y, i32 addrspace(1)* %out.gep.2 + store i32 %x, i32 addrspace(1)* %out.gep.3 + + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: buffer_store_dword [[LOAD]] +; GCN: s_endpgm +define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in, align 4 + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out, align 4 + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_load_ubyte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 + %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 + %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 + %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 + %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 + + %x = load i8, i8 addrspace(1)* %in + %y = load i8, i8 addrspace(1)* %in.gep.1 + %z = load i8, i8 addrspace(1)* %in.gep.2 + %w = load i8, i8 addrspace(1)* %in.gep.3 + + store i8 %x, i8 addrspace(1)* %out + store i8 %y, i8 addrspace(1)* %out.gep.1 + store i8 %z, i8 addrspace(1)* %out.gep.2 + store i8 %w, i8 addrspace(1)* %out.gep.3 + ret void +} + +; This works once AA is enabled on the subtarget +; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: +; GCN: buffer_load_dwordx4 
[[LOAD:v\[[0-9]+:[0-9]+\]]] +; XGCN: buffer_store_dwordx4 [[LOAD]] +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +; GCN: buffer_store_dword v +define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 + %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in + + %x = extractelement <4 x i32> %vec, i32 0 + %y = extractelement <4 x i32> %vec, i32 1 + %z = extractelement <4 x i32> %vec, i32 2 + %w = extractelement <4 x i32> %vec, i32 3 + + store i32 %x, i32 addrspace(1)* %out + store i32 %y, i32 addrspace(1)* %out.gep.1 + store i32 %z, i32 addrspace(1)* %out.gep.2 + store i32 %w, i32 addrspace(1)* %out.gep.3 + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: +; GCN: ds_write_b8 +; GCN: ds_write_b8 +; GCN: s_endpgm +define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 + + store i8 123, i8 addrspace(3)* %out.gep.1 + store i8 456, i8 addrspace(3)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: +; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 +; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b +; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] +; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} +define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out + ret void +} + +; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +; GCN: ds_write_b32 +define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { + %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 + %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 + %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 + + store i32 123, i32 addrspace(3)* %out.gep.1 + store i32 456, i32 addrspace(3)* %out.gep.2 + store i32 333, i32 addrspace(3)* %out.gep.3 + store i32 1234, i32 addrspace(3)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() #1 + +attributes #0 = { nounwind } +attributes #1 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll new file mode 100644 index 00000000000..0332d1a8e40 --- /dev/null +++ b/test/CodeGen/AMDGPU/min.ll @@ -0,0 +1,189 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin_sle_i32 +; SI: v_min_i32_e32 +define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp sle i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} 
+ +; FUNC-LABEL: @s_test_imin_sle_i32 +; SI: s_min_i32 +define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp sle i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imin_slt_i32 +; SI: v_min_i32_e32 +define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp slt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_imin_slt_i32 +; SI: s_min_i32 +define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp slt i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: +; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp slt i32 %a, 8 + %val = select i1 %cmp, i32 %a, i32 8 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: +; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 +define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { + %cmp = icmp sle i32 %a, 8 + %val = select i1 %cmp, i32 %a, i32 8 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ule_i32 +; SI: v_min_u32_e32 +define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ule i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ule_i32 +; SI: s_min_u32 +define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp ule i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i32 +; SI: v_min_u32_e32 +define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @s_test_umin_ult_i32 +; SI: s_min_u32 +define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) 
nounwind { + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_ult_i32_multi_use +; SI-NOT: v_min +; SI: v_cmp_lt_u32 +; SI-NEXT: v_cndmask_b32 +; SI-NOT: v_min +; SI: s_endpgm +define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid + %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %cmp = icmp ult i32 %a, %b + %val = select i1 %cmp, i32 %a, i32 %b + store i32 %val, i32 addrspace(1)* %outgep0, align 4 + store i1 %cmp, i1 addrspace(1)* %outgep1 + ret void +} + +; Make sure redundant and removed +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { + %a.ext = zext i16 %a to i32 + %b.ext = zext i16 %b to i32 + %cmp = icmp ult i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %mask = and i32 %val, 65535 + store i32 %mask, i32 addrspace(1)* %out + ret void +} + +; Make sure redundant sign_extend_inreg removed. + +; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: +; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] +; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] +; SI-NEXT: buffer_store_dword [[VMIN]] +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { + %a.ext = sext i16 %a to i32 + %b.ext = sext i16 %b to i32 + %cmp = icmp slt i32 %a.ext, %b.ext + %val = select i1 %cmp, i32 %a.ext, i32 %b.ext + %shl = shl i32 %val, 16 + %sextinreg = ashr i32 %shl, 16 + store i32 %sextinreg, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should get match min/max through extends inserted by +; legalization. 
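+; (Illustrative note added by the editor, not part of the original test.) After
+; legalization the i16 compare below is widened to i32, roughly:
+;   %a.ext = sext i16 %a to i32
+;   %b.ext = sext i16 %b to i32
+;   %cmp   = icmp sle i32 %a.ext, %b.ext
+;   %val   = select i1 %cmp, i32 %a.ext, i32 %b.ext
+; which in principle could fold to a single s_min_i32 on the extended values, but
+; currently selects to the s_sext_i32_i16 / v_cmp_le_i32 / v_cndmask_b32 sequence
+; the checks expect.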
+ +; FUNC-LABEL: {{^}}s_test_imin_sle_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_le_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sle i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll new file mode 100644 index 00000000000..38ef46d1bdd --- /dev/null +++ b/test/CodeGen/AMDGPU/min3.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin3_slt_i32 +; SI: v_min3_i32 +define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp slt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_ult_i32 +; SI: v_min3_u32 +define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ult i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ult i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_umin_umin +; SI: v_min_i32 +; SI: v_min3_i32 +define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %d = load i32, i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = 
icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %i1 + %i2 = select i1 %icmp2, i32 %i0, i32 %i1 + + store i32 %i2, i32 addrspace(1)* %outgep1, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_2_uses +; SI-NOT: v_min3 +define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32, i32 addrspace(1)* %gep0, align 4 + %b = load i32, i32 addrspace(1)* %gep1, align 4 + %c = load i32, i32 addrspace(1)* %gep2, align 4 + %d = load i32, i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %c + %i2 = select i1 %icmp2, i32 %i0, i32 %c + + store i32 %i2, i32 addrspace(1)* %outgep0, align 4 + store i32 %i0, i32 addrspace(1)* %outgep1, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll new file mode 100644 index 00000000000..4af9cdf1b96 --- /dev/null +++ b/test/CodeGen/AMDGPU/missing-store.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s + +@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 + +; Make sure when the load from %ptr2 is folded the chain isn't lost, +; resulting in losing the store to gptr + +; FUNC-LABEL: {{^}}missing_store_reduced: +; SI: ds_read_b64 +; SI: buffer_store_dword +; SI: buffer_load_dword +; SI: buffer_store_dword +; SI: s_endpgm +define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + store i32 %tmp2, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } + diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll new file mode 100644 index 00000000000..b19163f294e --- /dev/null +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -0,0 +1,183 @@ +; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s + +declare i32 @llvm.r600.read.tidig.x() readnone + +;;;==========================================================================;;; +;;; MUBUF LOAD TESTS +;;;==========================================================================;;; + +; MUBUF load with an immediate byte offset that fits into 12-bits +; CHECK-LABEL: {{^}}mubuf_load0: +; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 +define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 + %1 = 
load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; MUBUF load with the largest possible immediate offset +; CHECK-LABEL: {{^}}mubuf_load1: +; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 +define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +entry: + %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %1 = load i8, i8 addrspace(1)* %0 + store i8 %1, i8 addrspace(1)* %out + ret void +} + +; MUBUF load with an immediate byte offset that doesn't fit into 12-bits +; CHECK-LABEL: {{^}}mubuf_load2: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 +; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 +define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; MUBUF load with a 12-bit immediate offset and a register offset +; CHECK-LABEL: {{^}}mubuf_load3: +; CHECK-NOT: ADD +; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 +define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset + %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 + %2 = load i32, i32 addrspace(1)* %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}soffset_max_imm: +; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc +define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +main_body: + %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp2 = shl i32 %6, 2 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp4 = add i32 %6, 16 + %tmp5 = bitcast float 0.0 to i32 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +; Make sure immediates that aren't inline constants don't get folded into +; the soffset operand. +; FIXME: for this test we should be smart enough to shift the immediate into +; the offset field. 
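+; (Illustrative note added by the editor, not part of the original test; register
+; numbers are made up.) 65 (0x41) is outside the SI inline-constant range for
+; integers (-16..64), so it is currently materialized into an SGPR:
+;   s_movk_i32 s8, 0x41
+;   buffer_load_dword v0, v1, s[4:7], s8 offen glc
+; Since 65 also fits the unsigned 12-bit immediate offset field, the FIXME above
+; suggests folding it there instead, e.g.:
+;   buffer_load_dword v0, v1, s[4:7], 0 offen offset:65 glc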
+; CHECK-LABEL: {{^}}soffset_no_fold: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 +; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc +define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +main_body: + %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp2 = shl i32 %6, 2 + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp4 = add i32 %6, 16 + %tmp5 = bitcast float 0.0 to i32 + call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + ret void +} + +;;;==========================================================================;;; +;;; MUBUF STORE TESTS +;;;==========================================================================;;; + +; MUBUF store with an immediate byte offset that fits into 12-bits +; CHECK-LABEL: {{^}}mubuf_store0: +; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 +define void @mubuf_store0(i32 addrspace(1)* %out) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 + store i32 0, i32 addrspace(1)* %0 + ret void +} + +; MUBUF store with the largest possible immediate offset +; CHECK-LABEL: {{^}}mubuf_store1: +; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 + +define void @mubuf_store1(i8 addrspace(1)* %out) { +entry: + %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 + store i8 0, i8 addrspace(1)* %0 + ret void +} + +; MUBUF store with an immediate byte offset that doesn't fit into 12-bits +; CHECK-LABEL: {{^}}mubuf_store2: +; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 +; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 +define void @mubuf_store2(i32 addrspace(1)* %out) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 + store i32 0, i32 addrspace(1)* %0 + ret void +} + +; MUBUF store with a 12-bit immediate offset and a register offset +; CHECK-LABEL: {{^}}mubuf_store3: +; CHECK-NOT: ADD +; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 +define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { +entry: + %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset + %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 + store i32 0, i32 addrspace(1)* %1 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr: +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 +define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { + store i32 99, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 +define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: +; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 +; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
[[SOFFSET]] +define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: +; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 +; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] +define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst + ret void +} + +; CHECK-LABEL: {{^}}store_vgpr_ptr: +; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + store i32 99, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) + +attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll new file mode 100644 index 00000000000..94e0f96b323 --- /dev/null +++ b/test/CodeGen/AMDGPU/mul.ll @@ -0,0 +1,200 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; mul24 and mad24 are affected + +; FUNC-LABEL: {{^}}test_mul_v2i32: +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = mul <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_mul_v4i32: +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = mul <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: +; SI: s_load_dword +; SI: s_load_dword +; SI: s_mul_i32 +; SI: buffer_store_dword +define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: +; SI: s_load_dword +; SI: s_load_dword +; SI: v_mul_lo_i32 +; SI: buffer_store_dword +define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %mul = mul i64 %b, %a + %trunc = trunc i64 %mul to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top +; 32-bits of both arguments are sign bits. +; FUNC-LABEL: {{^}}mul64_sext_c: +; EG-DAG: MULLO_INT +; EG-DAG: MULHI_INT +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_i32 +define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { +entry: + %0 = sext i32 %in to i64 + %1 = mul i64 %0, 80 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_mul64_sext_c: +; EG-DAG: MULLO_INT +; EG-DAG: MULHI_INT +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_i32 +; SI: s_endpgm +define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 80 + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: +; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} +; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} +; SI: s_endpgm +define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { + %val = load i32, i32 addrspace(1)* %in, align 4 + %ext = sext i32 %val to i64 + %mul = mul i64 %ext, 9 + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_mul_i32: +; SI: s_load_dword [[SRC0:s[0-9]+]], +; SI: s_load_dword [[SRC1:s[0-9]+]], +; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +; SI: s_endpgm +define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %mul = mul i32 %a, %b + store i32 %mul, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}v_mul_i32: +; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = mul i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; A standard 64-bit multiply. 
The expansion should be around 6 instructions. +; It would be difficult to match the expansion correctly without writing +; a really complicated list of FileCheck expressions. I don't want +; to confuse people who may 'break' this test with a correct optimization, +; so this test just uses FUNC-LABEL to make sure the compiler does not +; crash with a 'failed to select' error. + +; FUNC-LABEL: {{^}}s_mul_i64: +define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %mul = mul i64 %a, %b + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_mul_i64: +; SI: v_mul_lo_i32 +define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %mul = mul i64 %a, %b + store i64 %mul, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}mul32_in_branch: +; SI: s_mul_i32 +define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i32, i32 addrspace(1)* %in + br label %endif + +else: + %2 = mul i32 %a, %b + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}mul64_in_branch: +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_u32 +; SI: s_endpgm +define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = load i64, i64 addrspace(1)* %in + br label %endif + +else: + %2 = mul i64 %a, %b + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll new file mode 100644 index 00000000000..7609dcc87af --- /dev/null +++ b/test/CodeGen/AMDGPU/mul_int24.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}i32_mul24: +; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
+; EG: MULLO_INT +; Make sure we are not masking the inputs +; CM-NOT: AND +; CM: MUL_INT24 +; SI-NOT: and +; SI: v_mul_i32_i24 +define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = ashr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = ashr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll new file mode 100644 index 00000000000..e640a7cd69f --- /dev/null +++ b/test/CodeGen/AMDGPU/mul_uint24.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FUNC-LABEL: {{^}}u32_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: v_mul_u32_u24 + +define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i16_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; EG: 16 +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 +define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) { +entry: + %0 = mul i16 %a, %b + %1 = sext i16 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i8_mul24: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; The result must be sign-extended +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 + +define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) { +entry: + %0 = mul i8 %a, %b + %1 = sext i8 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; Multiply with 24-bit inputs and 64-bit output +; FUNC-LABEL: {{^}}mul24_i64: +; EG: MUL_UINT24 +; EG: MULHI +; SI: v_mul_u32_u24 +; FIXME: SI supports 24-bit mulhi +; SI: v_mul_hi_u32 +define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = shl i64 %a, 40 + %a_24 = lshr i64 %0, 40 + %1 = shl i64 %b, 40 + %b_24 = lshr i64 %1, 40 + %2 = mul i64 %a_24, %b_24 + store i64 %2, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/mulhu.ll b/test/CodeGen/AMDGPU/mulhu.ll new file mode 100644 index 00000000000..29b0944a553 --- /dev/null +++ b/test/CodeGen/AMDGPU/mulhu.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab +;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} +;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 + +define void @test(i32 %p) { + %i = udiv i32 %p, 3 + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare <4 x float>
@llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll new file mode 100644 index 00000000000..9a814b579de --- /dev/null +++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s +; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s +; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s + +@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 + +; FUNC-LABEL: {{^}}load_extern_const_init: +define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 + +; FUNC-LABEL: {{^}}load_undef_const_init: +define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { + %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll new file mode 100644 index 00000000000..e4328ecbaca --- /dev/null +++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -0,0 +1,191 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; Make sure we don't turn the 32-bit argument load into a 16-bit +; load. There aren't extending scalar loads, so that would require +; using a buffer_load instruction. + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: +; SI: s_load_dword s +; SI: buffer_store_short v +define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i16 + store i16 %trunc, i16 addrspace(1)* %out + ret void +} + +; It should be OK (and probably performance neutral) to reduce this, +; but we don't know if the load is uniform yet.
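To make the rationale above concrete, here is a minimal sketch (illustrative IR only, not part of this patch; function names are made up) of the narrowing these tests guard against. Shrinking an i32 load whose only use is a trunc into an i16 load would leave no scalar (SMRD) pattern to select, since there are no extending scalar loads, so a uniform load narrowed this way would have to become a buffer load:

; Wide form the tests expect to be kept: load the full i32, truncate the value afterwards.
define void @keep_wide_load(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %val = load i32, i32 addrspace(1)* %in
  %trunc = trunc i32 %val to i16
  store i16 %trunc, i16 addrspace(1)* %out
  ret void
}

; Hypothetical shrunk form: the i16 load has no extending SMRD equivalent,
; so for a uniform pointer it could only be lowered through a buffer_load.
define void @shrunk_load(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %in.i16 = bitcast i32 addrspace(1)* %in to i16 addrspace(1)*
  %val = load i16, i16 addrspace(1)* %in.i16
  store i16 %val, i16 addrspace(1)* %out
  ret void
}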
+ +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: +; SI: buffer_load_dword v +; SI: buffer_store_short v +define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i16 + store i16 %trunc, i16 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { + %trunc = trunc i32 %arg to i1 + store i1 %trunc, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid + %load = load i32, i32 addrspace(1)* %gep.in + %trunc = trunc i32 %load to i1 + store i1 %trunc, i1 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: +; SI: s_load_dword s +; SI: buffer_store_dword v +define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { + %trunc = trunc i64 %arg to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: +; SI: buffer_load_dword v +; SI: buffer_store_dword v +define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %trunc = trunc i64 %load to i32 + store i32 %trunc, i32 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: +; SI: s_load_dword s +; SI: buffer_store_dword v +define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { + %srl = lshr i64 %arg, 32 + %trunc = trunc i64 %srl to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: +; SI: buffer_load_dword v +; SI: buffer_store_dword v +define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = 
getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %srl = lshr i64 %load, 32 + %trunc = trunc i64 %srl to i32 + store i32 %trunc, i32 addrspace(1)* %gep.out + ret void +} + +; Might as well reduce to 8-bit loads. +; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { + %trunc = trunc i16 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: +; SI: buffer_load_ubyte v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i16, i16 addrspace(1)* %gep.in + %trunc = trunc i16 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { + %srl = lshr i64 %arg, 32 + %trunc = trunc i64 %srl to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %srl = lshr i64 %load, 32 + %trunc = trunc i64 %srl to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: +; SI: s_load_dword s +; SI: buffer_store_byte v +define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { + %trunc = trunc i64 %arg to i8 + store i8 %trunc, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: +; SI: buffer_load_dword v +; SI: buffer_store_byte v +define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid + %load = load i64, i64 addrspace(1)* %gep.in + %trunc = trunc i64 %load to i8 + store i8 %trunc, i8 addrspace(1)* %gep.out + ret void +} diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll new file mode 100644 index 00000000000..816755efb07 --- /dev/null +++ b/test/CodeGen/AMDGPU/operand-folding.ll @@ -0,0 +1,113 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}fold_sgpr: +; CHECK: v_add_i32_e32 v{{[0-9]+}}, s +define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +entry: + %tmp0 = icmp ne i32 %fold, 0 + br i1 %tmp0, label %if, label %endif + +if: + %id = call i32 @llvm.r600.read.tidig.x() + %offset = add i32 %fold, %id + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset + store i32 0, i32 addrspace(1)* %tmp1 + br label %endif + +endif: + ret void +} + +; CHECK-LABEL: {{^}}fold_imm: +; 
CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 +define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +entry: + %fold = add i32 3, 2 + %tmp0 = icmp ne i32 %cmp, 0 + br i1 %tmp0, label %if, label %endif + +if: + %id = call i32 @llvm.r600.read.tidig.x() + %val = or i32 %id, %fold + store i32 %val, i32 addrspace(1)* %out + br label %endif + +endif: + ret void +} + +; CHECK-LABEL: {{^}}fold_64bit_constant_add: +; CHECK-NOT: s_mov_b64 +; FIXME: It would be better if we could use v_add here and drop the extra +; v_mov_b32 instructions. +; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1 +; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0 +; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]] +; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] +; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, + +define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +entry: + %tmp0 = add i64 %val, 1 + store i64 %tmp0, i64 addrspace(1)* %out + ret void +} + +; Inline constants should always be folded. + +; CHECK-LABEL: {{^}}vector_inline: +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} + +define void @vector_inline(<4 x i32> addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = add i32 %tmp0, 1 + %tmp2 = add i32 %tmp0, 2 + %tmp3 = add i32 %tmp0, 3 + %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 + %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 + %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 + %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 + %tmp4 = xor <4 x i32> <i32 5, i32 5, i32 5, i32 5>, %vec3 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out + ret void +} + +; Immediates with one use should be folded +; CHECK-LABEL: {{^}}imm_one_use: +; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} + +define void @imm_one_use(i32 addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = xor i32 %tmp0, 100 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} +; CHECK-LABEL: {{^}}vector_imm: +; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} +; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} + +define void @vector_imm(<4 x i32> addrspace(1)* %out) { +entry: + %tmp0 = call i32 @llvm.r600.read.tidig.x() + %tmp1 = add i32 %tmp0, 1 + %tmp2 = add i32 %tmp0, 2 + %tmp3 = add i32 %tmp0, 3 + %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 + %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 + %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 + %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 + %tmp4 = xor <4 x i32> <i32 100, i32 100, i32 100, i32 100>, %vec3 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll new file mode 100644 index 00000000000..20420a84de6 --- /dev/null +++ b/test/CodeGen/AMDGPU/operand-spacing.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s + +; Make sure there isn't an extra space 
between the instruction name and first operands. + +; GCN-LABEL: {{^}}add_f32: +; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] +; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] +; GCN: buffer_store_dword [[RESULT]], +define void @add_f32(float addrspace(1)* %out, float %a, float %b) { + %result = fadd float %a, %b + store float %result, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll new file mode 100644 index 00000000000..1c04090b407 --- /dev/null +++ b/test/CodeGen/AMDGPU/or.ll @@ -0,0 +1,178 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}or_v2i32: +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = or <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}or_v4i32: +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: OR_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = or <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_i32: +; SI: s_or_b32 +define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %or = or i32 %a, %b + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i32: +; SI: v_or_b32_e32 v{{[0-9]}} +define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { + %loada = load i32, i32 addrspace(1)* %a + %or = or i32 %loada, %b + store i32 %or, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_literal_i32: +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f +define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { + %or = or i32 %a, 99999 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}vector_or_literal_i32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { + %loada = load i32, i32 addrspace(1)* %a, align 4 + %or = or i32 %loada, 65535 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: +; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} +define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { + %loada = load i32, i32 addrspace(1)* %a, align 4 + %or = or i32 %loada, 4 + store i32 %or, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}scalar_or_i64: +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z + +; SI: s_or_b64 +define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %or = or i64 %a, %b + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64: +; SI: v_or_b32_e32 v{{[0-9]}} +; SI: v_or_b32_e32 v{{[0-9]}} +define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %loadb = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, %loadb + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_vector_or_i64: +; SI: v_or_b32_e32 v{{[0-9]}} +; SI: v_or_b32_e32 v{{[0-9]}} +define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { + %loada = load i64, i64 addrspace(1)* %a + %or = or i64 %loada, %b + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_or_i64_loadimm: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f +; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) 
{ + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, 22470723082367 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FIXME: The or 0 should really be removed. +; FUNC-LABEL: {{^}}vector_or_i64_imm: +; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] +; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} +; SI: s_endpgm +define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 8 + %or = or i64 %loada, 8 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: +; SI: s_load_dword s[[SREG0:[0-9]+]] +; SI: s_load_dword s[[SREG1:[0-9]+]] +; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] +; SI: buffer_store_dword [[VRESULT]], +define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { + %add = or i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}or_i1: +; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float, float addrspace(1)* %in0 + %b = load float, float addrspace(1)* %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 0.000000e+00 + %or = or i1 %acmp, %bcmp + %result = zext i1 %or to i32 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_or_i1: +; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] +define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { + %cmp0 = icmp eq i32 %a, %b + %cmp1 = icmp eq i32 %c, %d + %or = or i1 %cmp0, %cmp1 + store i1 %or, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/packetizer.ll b/test/CodeGen/AMDGPU/packetizer.ll new file mode 100644 index 00000000000..49a7c0df748 --- /dev/null +++ b/test/CodeGen/AMDGPU/packetizer.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +; CHECK: {{^}}test: +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y +; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z +; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W + +define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +entry: + %shl = sub i32 32, %e + %x = add i32 %x_arg, 1 + %x.0 = shl i32 %x, %shl + %x.1 = lshr i32 %x, %e + %x.2 = or i32 %x.0, %x.1 + %y = add i32 %y_arg, 1 + %y.0 = shl i32 %y, %shl + %y.1 = lshr i32 %y, %e + %y.2 = or i32 %y.0, %y.1 + %z = add i32 %z_arg, 1 + %z.0 = shl i32 %z, %shl + %z.1 = lshr i32 %z, %e + %z.2 = or i32 %z.0, %z.1 + %w = add i32 %w_arg, 1 + %w.0 = shl i32 %w, %shl + %w.1 = lshr i32 %w, %e + %w.2 = or i32 %w.0, %w.1 + %xy = or i32 %x.2, %y.2 + %zw = or i32 %z.2, %w.2 + %xyzw = or i32 %xy, %zw + store i32 %xyzw, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll new file mode 100644 index 00000000000..f32b044198a --- /dev/null +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -0,0 +1,59 @@ +; Function Attrs: nounwind +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s +; +; CFG flattening should use parallel-and mode to 
generate branch conditions and +; then merge if-regions with the same bodies. +; +; CHECK: AND_INT +; CHECK-NEXT: AND_INT +; CHECK-NEXT: OR_INT + +; FIXME: For some reason having the allocas here allowed the flatten cfg pass +; to do its transformation; however, now that we are using local memory for +; allocas, the transformation isn't happening. + +define void @_Z9chk1D_512v() #0 { +entry: + %a0 = alloca i32, align 4 + %b0 = alloca i32, align 4 + %c0 = alloca i32, align 4 + %d0 = alloca i32, align 4 + %a1 = alloca i32, align 4 + %b1 = alloca i32, align 4 + %c1 = alloca i32, align 4 + %d1 = alloca i32, align 4 + %data = alloca i32, align 4 + %0 = load i32, i32* %a0, align 4 + %1 = load i32, i32* %b0, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true:                                    ; preds = %entry + %2 = load i32, i32* %c0, align 4 + %3 = load i32, i32* %d0, align 4 + %cmp1 = icmp ne i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.end + +if.then:                                          ; preds = %land.lhs.true + store i32 1, i32* %data, align 4 + br label %if.end + +if.end:                                           ; preds = %if.then, %land.lhs.true, %entry + %4 = load i32, i32* %a1, align 4 + %5 = load i32, i32* %b1, align 4 + %cmp2 = icmp ne i32 %4, %5 + br i1 %cmp2, label %land.lhs.true3, label %if.end6 + +land.lhs.true3:                                   ; preds = %if.end + %6 = load i32, i32* %c1, align 4 + %7 = load i32, i32* %d1, align 4 + %cmp4 = icmp ne i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.end6 + +if.then5:                                         ; preds = %land.lhs.true3 + store i32 1, i32* %data, align 4 + br label %if.end6 + +if.end6:                                          ; preds = %if.then5, %land.lhs.true3, %if.end + ret void +} diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll new file mode 100644 index 00000000000..1da1e91b8ab --- /dev/null +++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -0,0 +1,66 @@ +; Function Attrs: nounwind +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; +; CFG flattening should use parallel-or to generate branch conditions and +; then merge if-regions with the same bodies. + +; FIXME: For some reason having the allocas here allowed the flatten cfg pass +; to do its transformation; however, now that we are using local memory for +; allocas, the transformation isn't happening. 
+; XFAIL: * +; +; CHECK: OR_INT +; CHECK-NEXT: OR_INT +; CHECK-NEXT: OR_INT +define void @_Z9chk1D_512v() #0 { +entry: + %a0 = alloca i32, align 4 + %b0 = alloca i32, align 4 + %c0 = alloca i32, align 4 + %d0 = alloca i32, align 4 + %a1 = alloca i32, align 4 + %b1 = alloca i32, align 4 + %c1 = alloca i32, align 4 + %d1 = alloca i32, align 4 + %data = alloca i32, align 4 + %0 = load i32, i32* %a0, align 4 + %1 = load i32, i32* %b0, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %land.lhs.true, label %if.else + +land.lhs.true: ; preds = %entry + %2 = load i32, i32* %c0, align 4 + %3 = load i32, i32* %d0, align 4 + %cmp1 = icmp ne i32 %2, %3 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %land.lhs.true + br label %if.end + +if.else: ; preds = %land.lhs.true, %entry + store i32 1, i32* %data, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %4 = load i32, i32* %a1, align 4 + %5 = load i32, i32* %b1, align 4 + %cmp2 = icmp ne i32 %4, %5 + br i1 %cmp2, label %land.lhs.true3, label %if.else6 + +land.lhs.true3: ; preds = %if.end + %6 = load i32, i32* %c1, align 4 + %7 = load i32, i32* %d1, align 4 + %cmp4 = icmp ne i32 %6, %7 + br i1 %cmp4, label %if.then5, label %if.else6 + +if.then5: ; preds = %land.lhs.true3 + br label %if.end7 + +if.else6: ; preds = %land.lhs.true3, %if.end + store i32 1, i32* %data, align 4 + br label %if.end7 + +if.end7: ; preds = %if.else6, %if.then5 + ret void +} + diff --git a/test/CodeGen/AMDGPU/predicate-dp4.ll b/test/CodeGen/AMDGPU/predicate-dp4.ll new file mode 100644 index 00000000000..6bc18759435 --- /dev/null +++ b/test/CodeGen/AMDGPU/predicate-dp4.ll @@ -0,0 +1,27 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +; CHECK-LABEL: {{^}}main: +; CHECK: PRED_SETE_INT * Pred, +; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one +define void @main(<4 x float> inreg) #0 { +main_body: + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %IF, label %ENDIF + +IF: ; preds = %main_body + %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0) + br label %ENDIF + +ENDIF: ; preds = %IF, %main_body + %5 = phi float [%4, %IF], [0.000000e+00, %main_body] + %6 = insertelement <4 x float> undef, float %5, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +attributes #1 = { readnone } +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll new file mode 100644 index 00000000000..0ce74d97ba8 --- /dev/null +++ b/test/CodeGen/AMDGPU/predicates.ll @@ -0,0 +1,104 @@ +; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s + +; These tests make sure the compiler is optimizing branches using predicates +; when it is legal to do so. 
+ +; CHECK: {{^}}simple_if: +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ENDIF: + %2 = phi i32 [ %in, %entry ], [ %1, %IF ] + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}simple_if_else: +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ELSE + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ELSE: + %2 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}nested_if: +; CHECK: ALU_PUSH_BEFORE +; CHECK: JUMP +; CHECK: POP +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ENDIF + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ENDIF: + %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1] + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}nested_if_else: +; CHECK: ALU_PUSH_BEFORE +; CHECK: JUMP +; CHECK: POP +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel +define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ELSE1 + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ELSE1: + %4 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll new file mode 100644 index 00000000000..a008ac98a43 --- /dev/null +++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s + +; This works because promote allocas pass replaces these with LDS atomics. + +; Private atomics have no real use, but at least shouldn't crash on it. 
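For context, a rough sketch (assumed shape with illustrative names; not part of this patch) of what the promote-alloca pass does with the tests below: the alloca is rewritten to LDS storage in addrspace(3), so the atomic operations become local-memory atomics that the backend can select as ds_* instructions. The real pass sizes the LDS array for the whole work-group and indexes it by workitem id; that bookkeeping is omitted here:

; Before promotion (as written in the test): alloca + atomicrmw on a private pointer.
; After promotion, the storage lives in LDS and the atomic operates on addrspace(3).
@promoted.tmp = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4

define void @atomicrmw_lds_sketch(i32 addrspace(1)* %out, i32 %in) {
entry:
  ; The former private pointer now points into local (LDS) memory.
  %ptr = getelementptr [2 x i32], [2 x i32] addrspace(3)* @promoted.tmp, i32 0, i32 %in
  %old = atomicrmw add i32 addrspace(3)* %ptr, i32 7 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}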
+define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic + %val = extractvalue { i32, i1 } %tmp4, 0 + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll new file mode 100644 index 00000000000..6b18a19f195 --- /dev/null +++ b/test/CodeGen/AMDGPU/private-memory-broken.ll @@ -0,0 +1,21 @@ +; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s + +; Make sure promote alloca pass doesn't crash + +; CHECK: unsupported call + +declare i32 @foo(i32*) nounwind + +define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in + %val = call i32 @foo(i32* %tmp3) nounwind + store i32 %val, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/private-memory.ll b/test/CodeGen/AMDGPU/private-memory.ll new file mode 100644 index 00000000000..1c562978050 --- /dev/null +++ b/test/CodeGen/AMDGPU/private-memory.ll @@ -0,0 +1,313 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}mova_same_clause: + +; R600: LDS_WRITE +; R600: LDS_WRITE +; R600: LDS_READ +; R600: LDS_READ + +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_write_b32 +; SI-PROMOTE: ds_read_b32 +; SI-PROMOTE: ds_read_b32 + +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: 
[0x00,0x10,0x70,0xe0 +define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +entry: + %stack = alloca [5 x i32], align 4 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 + store i32 4, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 + store i32 5, i32* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 + %2 = load i32, i32* %arrayidx10, align 4 + store i32 %2, i32 addrspace(1)* %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 + %3 = load i32, i32* %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 + store i32 %3, i32 addrspace(1)* %arrayidx13 + ret void +} + +; This test checks that the stack offset is calculated correctly for structs. +; All register loads/stores should be optimized away, so there shouldn't be +; any MOVA instructions. +; +; XXX: This generated code has unnecessary MOVs, we should be able to optimize +; this. + +; FUNC-LABEL: {{^}}multiple_structs: +; R600-NOT: MOVA_INT +; SI-NOT: v_movrel +; SI-NOT: v_movrel +%struct.point = type { i32, i32 } + +define void @multiple_structs(i32 addrspace(1)* %out) { +entry: + %a = alloca %struct.point + %b = alloca %struct.point + %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 + %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 + %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 + %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 + store i32 0, i32* %a.x.ptr + store i32 1, i32* %a.y.ptr + store i32 2, i32* %b.x.ptr + store i32 3, i32* %b.y.ptr + %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 + %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 + %a.indirect = load i32, i32* %a.indirect.ptr + %b.indirect = load i32, i32* %b.indirect.ptr + %0 = add i32 %a.indirect, %b.indirect + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; Test direct access of a private array inside a loop. The private array +; loads and stores should be lowered to copies, so there shouldn't be any +; MOVA instructions. 
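As a hedged illustration (not from the patch): MOVA on R600 and v_movrel on SI are only needed for indirect addressing, i.e. when the index into the promoted private array is not a compile-time constant. The accesses in these tests use constant indices, so they can be lowered to plain register copies, which is what the MOVA/v_movrel checks above and below verify:

define i32 @constant_vs_dynamic_index(i32 %idx) {
entry:
  %arr = alloca [2 x i32]
  ; Constant index: the element can be promoted straight to a register copy.
  %elt0.ptr = getelementptr [2 x i32], [2 x i32]* %arr, i32 0, i32 0
  store i32 0, i32* %elt0.ptr
  ; Variable index: would need indirect addressing (MOVA / v_movrel) or scratch memory.
  %eltN.ptr = getelementptr [2 x i32], [2 x i32]* %arr, i32 0, i32 %idx
  %val = load i32, i32* %eltN.ptr
  ret i32 %val
}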
+ +; FUNC-LABEL: {{^}}direct_loop: +; R600-NOT: MOVA_INT +; SI-NOT: v_movrel + +define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %prv_array_const = alloca [2 x i32] + %prv_array = alloca [2 x i32] + %a = load i32, i32 addrspace(1)* %in + %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %b = load i32, i32 addrspace(1)* %b_src_ptr + %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 + store i32 %a, i32* %a_dst_ptr + %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 + store i32 %b, i32* %b_dst_ptr + br label %for.body + +for.body: + %inc = phi i32 [0, %entry], [%count, %for.body] + %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 + %x = load i32, i32* %x_ptr + %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 + %y = load i32, i32* %y_ptr + %xy = add i32 %x, %y + store i32 %xy, i32* %y_ptr + %count = add i32 %inc, 1 + %done = icmp eq i32 %count, 4095 + br i1 %done, label %for.end, label %for.body + +for.end: + %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 + %value = load i32, i32* %value_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}short_array: + +; R600: MOVA_INT + +; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0 +; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0 +; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +define void @short_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [2 x i16] + %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1 + store i16 0, i16* %1 + store i16 1, i16* %2 + %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index + %4 = load i16, i16* %3 + %5 = sext i16 %4 to i32 + store i32 %5, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}char_array: + +; R600: MOVA_INT + +; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0 +; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0 +define void @char_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [2 x i8] + %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1 + store i8 0, i8* %1 + store i8 1, i8* %2 + %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index + %4 = load i8, i8* %3 + %5 = sext i8 %4 to i32 + store i32 %5, i32 addrspace(1)* %out + ret void + +} + +; Make sure we don't overwrite workitem information with private memory + +; FUNC-LABEL: {{^}}work_item_info: +; R600-NOT: MOV T0.X +; Additional check in case the move ends up in the last slot +; R600-NOT: MOV * TO.X + +; SI-NOT: v_mov_b32_e{{(32|64)}} v0 +define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = alloca [2 x i32] + %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 + %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 + store i32 0, i32* %1 + store i32 1, i32* %2 + %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in + %4 = load i32, i32* %3 + %5 = call i32 @llvm.r600.read.tidig.x() + %6 = add i32 %4, %5 + store i32 %6, i32 addrspace(1)* 
%out + ret void +} + +; Test that two stack objects are not stored in the same register +; The second stack object should be in T3.X +; FUNC-LABEL: {{^}}no_overlap: +; R600_CHECK: MOV +; R600_CHECK: [[CHAN:[XYZW]]]+ +; R600-NOT: [[CHAN]]+ +; SI: v_mov_b32_e32 v3 +define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = alloca [3 x i8], align 1 + %1 = alloca [2 x i8], align 1 + %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 + %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 + %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 + %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 + %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 + store i8 0, i8* %2 + store i8 1, i8* %3 + store i8 2, i8* %4 + store i8 1, i8* %5 + store i8 0, i8* %6 + %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in + %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in + %9 = load i8, i8* %7 + %10 = load i8, i8* %8 + %11 = add i8 %9, %10 + %12 = sext i8 %11 to i32 + store i32 %12, i32 addrspace(1)* %out + ret void +} + +define void @char_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i8]] + %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 + store i8 0, i8* %gep0 + store i8 1, i8* %gep1 + %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index + %load = load i8, i8* %gep2 + %sext = sext i8 %load to i32 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i32]] + %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x i64]] + %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 + %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 + store i64 0, i64* %gep0 + store i64 1, i64* %gep1 + %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index + %load = load i64, i64* %gep2 + store i64 %load, i64 addrspace(1)* %out + ret void +} + +%struct.pair32 = type { i32, i32 } + +define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x [2 x %struct.pair32]] + %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { +entry: + %alloca = alloca [2 x %struct.pair32] + %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 + %gep1 = getelementptr [2 x 
%struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 + store i32 0, i32* %gep0 + store i32 1, i32* %gep1 + %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 + %load = load i32, i32* %gep2 + store i32 %load, i32 addrspace(1)* %out + ret void +} + +define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +entry: + %tmp = alloca [2 x i32] + %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 + %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 + store i32 0, i32* %tmp1 + store i32 1, i32* %tmp2 + %cmp = icmp eq i32 %in, 0 + %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 + %load = load i32, i32* %sel + store i32 %load, i32 addrspace(1)* %out + ret void +} + +; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it +; finds one, it should stop trying to promote. + +; FUNC-LABEL: ptrtoint: +; SI-NOT: ds_write +; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 +define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %alloca = alloca [16 x i32] + %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a + store i32 5, i32* %tmp0 + %tmp1 = ptrtoint [16 x i32]* %alloca to i32 + %tmp2 = add i32 %tmp1, 5 + %tmp3 = inttoptr i32 %tmp2 to i32* + %tmp4 = getelementptr i32, i32* %tmp3, i32 %b + %tmp5 = load i32, i32* %tmp4 + store i32 %tmp5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/pv-packing.ll b/test/CodeGen/AMDGPU/pv-packing.ll new file mode 100644 index 00000000000..abeae563ff3 --- /dev/null +++ b/test/CodeGen/AMDGPU/pv-packing.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s + +;CHECK: DOT4 T{{[0-9]\.X}} +;CHECK: MULADD_IEEE * T{{[0-9]\.W}} + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg2, i32 0 + %4 = extractelement <4 x float> %reg2, i32 1 + %5 = extractelement <4 x float> %reg2, i32 2 + %6 = extractelement <4 x float> %reg3, i32 0 + %7 = extractelement <4 x float> %reg3, i32 1 + %8 = extractelement <4 x float> %reg3, i32 2 + %9 = load <4 x float>, <4 x float> addrspace(8)* null + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9) + %12 = fmul float %0, %3 + %13 = fadd float %12, %6 + %14 = fmul float %1, %4 + %15 = fadd float %14, %7 + %16 = fmul float %2, %5 + %17 = fadd float %16, %8 + %18 = fmul float %11, %11 + %19 = fadd float %18, %0 + %20 = insertelement <4 x float> undef, float %13, i32 0 + %21 = insertelement <4 x float> %20, float %15, i32 1 + %22 = insertelement <4 x float> %21, float %17, i32 2 + %23 = insertelement <4 x float> %22, float %19, i32 3 + %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10) + %25 = insertelement <4 x float> undef, float %24, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { 
"ShaderType"="1" } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll new file mode 100644 index 00000000000..9a57dd19765 --- /dev/null +++ b/test/CodeGen/AMDGPU/pv.ll @@ -0,0 +1,241 @@ +; RUN: llc < %s -march=r600 | FileCheck %s + +; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) +; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = extractelement <4 x float> %reg2, i32 0 + %5 = extractelement <4 x float> %reg2, i32 1 + %6 = extractelement <4 x float> %reg2, i32 2 + %7 = extractelement <4 x float> %reg2, i32 3 + %8 = extractelement <4 x float> %reg3, i32 0 + %9 = extractelement <4 x float> %reg3, i32 1 + %10 = extractelement <4 x float> %reg3, i32 2 + %11 = extractelement <4 x float> %reg3, i32 3 + %12 = extractelement <4 x float> %reg4, i32 0 + %13 = extractelement <4 x float> %reg4, i32 1 + %14 = extractelement <4 x float> %reg4, i32 2 + %15 = extractelement <4 x float> %reg4, i32 3 + %16 = extractelement <4 x float> %reg5, i32 0 + %17 = extractelement <4 x float> %reg5, i32 1 + %18 = extractelement <4 x float> %reg5, i32 2 + %19 = extractelement <4 x float> %reg5, i32 3 + %20 = extractelement <4 x float> %reg6, i32 0 + %21 = extractelement <4 x float> %reg6, i32 1 + %22 = extractelement <4 x float> %reg6, i32 2 + %23 = extractelement <4 x float> %reg6, i32 3 + %24 = extractelement <4 x float> %reg7, i32 0 + %25 = extractelement <4 x float> %reg7, i32 1 + %26 = extractelement <4 x float> %reg7, i32 2 + %27 = extractelement <4 x float> %reg7, i32 3 + %28 = load <4 x float>, <4 x float> addrspace(8)* null + %29 = extractelement <4 x float> %28, i32 0 + %30 = fmul float %0, %29 + %31 = load <4 x float>, <4 x float> addrspace(8)* null + %32 = extractelement <4 x float> %31, i32 1 + %33 = fmul float %0, %32 + %34 = load <4 x float>, <4 x float> addrspace(8)* null + %35 = extractelement <4 x float> %34, i32 2 + %36 = fmul float %0, %35 + %37 = load <4 x float>, <4 x float> addrspace(8)* null + %38 = extractelement <4 x float> %37, i32 3 + %39 = fmul float %0, %38 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %41 = extractelement <4 x float> %40, i32 0 + %42 = fmul float %1, %41 + %43 = fadd float %42, %30 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %45 = extractelement <4 x float> %44, i32 1 + %46 = fmul float %1, %45 + %47 = fadd float %46, %33 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %49 = extractelement <4 x float> %48, i32 2 + %50 = fmul float %1, %49 + %51 = fadd float %50, %36 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %53 = extractelement <4 x float> %52, i32 3 + %54 = fmul float %1, %53 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] 
addrspace(8)* null, i64 0, i32 2) + %57 = extractelement <4 x float> %56, i32 0 + %58 = fmul float %2, %57 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %61 = extractelement <4 x float> %60, i32 1 + %62 = fmul float %2, %61 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %65 = extractelement <4 x float> %64, i32 2 + %66 = fmul float %2, %65 + %67 = fadd float %66, %51 + %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %69 = extractelement <4 x float> %68, i32 3 + %70 = fmul float %2, %69 + %71 = fadd float %70, %55 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %73 = extractelement <4 x float> %72, i32 0 + %74 = fmul float %3, %73 + %75 = fadd float %74, %59 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %77 = extractelement <4 x float> %76, i32 1 + %78 = fmul float %3, %77 + %79 = fadd float %78, %63 + %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %81 = extractelement <4 x float> %80, i32 2 + %82 = fmul float %3, %81 + %83 = fadd float %82, %67 + %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %85 = extractelement <4 x float> %84, i32 3 + %86 = fmul float %3, %85 + %87 = fadd float %86, %71 + %88 = insertelement <4 x float> undef, float %4, i32 0 + %89 = insertelement <4 x float> %88, float %5, i32 1 + %90 = insertelement <4 x float> %89, float %6, i32 2 + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 + %92 = insertelement <4 x float> undef, float %4, i32 0 + %93 = insertelement <4 x float> %92, float %5, i32 1 + %94 = insertelement <4 x float> %93, float %6, i32 2 + %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 + %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) + %97 = call float @fabs(float %96) + %98 = call float @llvm.AMDGPU.rsq.f32(float %97) + %99 = fmul float %4, %98 + %100 = fmul float %5, %98 + %101 = fmul float %6, %98 + %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %103 = extractelement <4 x float> %102, i32 0 + %104 = fmul float %103, %8 + %105 = fadd float %104, %20 + %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %107 = extractelement <4 x float> %106, i32 1 + %108 = fmul float %107, %9 + %109 = fadd float %108, %21 + %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %111 = extractelement <4 x float> %110, i32 2 + %112 = fmul float %111, %10 + %113 = fadd float %112, %22 + %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) + %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) + %116 = call float @llvm.AMDIL.clamp.(float %113, float 
0.000000e+00, float 1.000000e+00) + %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %119 = extractelement <4 x float> %118, i32 0 + %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %121 = extractelement <4 x float> %120, i32 1 + %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %123 = extractelement <4 x float> %122, i32 2 + %124 = insertelement <4 x float> undef, float %99, i32 0 + %125 = insertelement <4 x float> %124, float %100, i32 1 + %126 = insertelement <4 x float> %125, float %101, i32 2 + %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 + %128 = insertelement <4 x float> undef, float %119, i32 0 + %129 = insertelement <4 x float> %128, float %121, i32 1 + %130 = insertelement <4 x float> %129, float %123, i32 2 + %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 + %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131) + %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %134 = extractelement <4 x float> %133, i32 0 + %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %136 = extractelement <4 x float> %135, i32 1 + %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %138 = extractelement <4 x float> %137, i32 2 + %139 = insertelement <4 x float> undef, float %99, i32 0 + %140 = insertelement <4 x float> %139, float %100, i32 1 + %141 = insertelement <4 x float> %140, float %101, i32 2 + %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 + %143 = insertelement <4 x float> undef, float %134, i32 0 + %144 = insertelement <4 x float> %143, float %136, i32 1 + %145 = insertelement <4 x float> %144, float %138, i32 2 + %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 + %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146) + %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %149 = extractelement <4 x float> %148, i32 0 + %150 = fmul float %149, %8 + %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %152 = extractelement <4 x float> %151, i32 1 + %153 = fmul float %152, %9 + %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) + %155 = extractelement <4 x float> %154, i32 2 + %156 = fmul float %155, %10 + %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %158 = extractelement <4 x float> %157, i32 0 + %159 = fmul float %158, %12 + %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %161 = extractelement <4 x float> %160, i32 1 + %162 = fmul float %161, %13 + %163 = load <4 x float>, <4 x 
float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) + %164 = extractelement <4 x float> %163, i32 2 + %165 = fmul float %164, %14 + %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %167 = extractelement <4 x float> %166, i32 0 + %168 = fmul float %167, %16 + %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %170 = extractelement <4 x float> %169, i32 1 + %171 = fmul float %170, %17 + %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) + %173 = extractelement <4 x float> %172, i32 2 + %174 = fmul float %173, %18 + %175 = fcmp uge float %132, 0.000000e+00 + %176 = select i1 %175, float %132, float 0.000000e+00 + %177 = fcmp uge float %147, 0.000000e+00 + %178 = select i1 %177, float %147, float 0.000000e+00 + %179 = call float @llvm.pow.f32(float %178, float %24) + %180 = fcmp ult float %132, 0.000000e+00 + %181 = select i1 %180, float 0.000000e+00, float %179 + %182 = fadd float %150, %105 + %183 = fadd float %153, %109 + %184 = fadd float %156, %113 + %185 = fmul float %176, %159 + %186 = fadd float %185, %182 + %187 = fmul float %176, %162 + %188 = fadd float %187, %183 + %189 = fmul float %176, %165 + %190 = fadd float %189, %184 + %191 = fmul float %181, %168 + %192 = fadd float %191, %186 + %193 = fmul float %181, %171 + %194 = fadd float %193, %188 + %195 = fmul float %181, %174 + %196 = fadd float %195, %190 + %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) + %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) + %200 = insertelement <4 x float> undef, float %75, i32 0 + %201 = insertelement <4 x float> %200, float %79, i32 1 + %202 = insertelement <4 x float> %201, float %83, i32 2 + %203 = insertelement <4 x float> %202, float %87, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1) + %204 = insertelement <4 x float> undef, float %197, i32 0 + %205 = insertelement <4 x float> %204, float %198, i32 1 + %206 = insertelement <4 x float> %205, float %199, i32 2 + %207 = insertelement <4 x float> %206, float %117, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +; Function Attrs: readonly +declare float @fabs(float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #3 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } +attributes #2 = { readonly } +attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/r600-encoding.ll b/test/CodeGen/AMDGPU/r600-encoding.ll new file mode 100644 index 00000000000..3a82ee30a32 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-encoding.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | 
FileCheck --check-prefix=R600 %s + +; The earliest R600 GPUs have a slightly different encoding than the rest of +; the VLIW4/5 GPUs. + +; EG: {{^}}test: +; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] + +; R600: {{^}}test: +; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] + +define void @test(<4 x float> inreg %reg0) #0 { +entry: + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fmul float %r0, %r1 + %vec = insertelement <4 x float> undef, float %r2, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/r600-export-fix.ll b/test/CodeGen/AMDGPU/r600-export-fix.ll new file mode 100644 index 00000000000..7cb80195b36 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-export-fix.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s + +;CHECK: EXPORT T{{[0-9]}}.XYZW +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0XYZ +;CHECK: EXPORT T{{[0-9]}}.XYZW +;CHECK: EXPORT T{{[0-9]}}.YZ00 +;CHECK: EXPORT T{{[0-9]}}.0000 +;CHECK: EXPORT T{{[0-9]}}.0000 + + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fmul float %5, %0 + %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %8 = extractelement <4 x float> %7, i32 1 + %9 = fmul float %8, %0 + %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %11 = extractelement <4 x float> %10, i32 2 + %12 = fmul float %11, %0 + %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) + %14 = extractelement <4 x float> %13, i32 3 + %15 = fmul float %14, %0 + %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %17 = extractelement <4 x float> %16, i32 0 + %18 = fmul float %17, %1 + %19 = fadd float %18, %6 + %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %21 = extractelement <4 x float> %20, i32 1 + %22 = fmul float %21, %1 + %23 = fadd float %22, %9 + %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %25 = extractelement <4 x float> %24, i32 2 + %26 = fmul float %25, %1 + %27 = fadd float %26, %12 + %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fmul float %29, %1 + %31 = fadd float %30, %15 + %32 = load 
<4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %33 = extractelement <4 x float> %32, i32 0 + %34 = fmul float %33, %2 + %35 = fadd float %34, %19 + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %37 = extractelement <4 x float> %36, i32 1 + %38 = fmul float %37, %2 + %39 = fadd float %38, %23 + %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %41 = extractelement <4 x float> %40, i32 2 + %42 = fmul float %41, %2 + %43 = fadd float %42, %27 + %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) + %45 = extractelement <4 x float> %44, i32 3 + %46 = fmul float %45, %2 + %47 = fadd float %46, %31 + %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %49 = extractelement <4 x float> %48, i32 0 + %50 = fmul float %49, %3 + %51 = fadd float %50, %35 + %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %53 = extractelement <4 x float> %52, i32 1 + %54 = fmul float %53, %3 + %55 = fadd float %54, %39 + %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %57 = extractelement <4 x float> %56, i32 2 + %58 = fmul float %57, %3 + %59 = fadd float %58, %43 + %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) + %61 = extractelement <4 x float> %60, i32 3 + %62 = fmul float %61, %3 + %63 = fadd float %62, %47 + %64 = load <4 x float>, <4 x float> addrspace(8)* null + %65 = extractelement <4 x float> %64, i32 0 + %66 = load <4 x float>, <4 x float> addrspace(8)* null + %67 = extractelement <4 x float> %66, i32 1 + %68 = load <4 x float>, <4 x float> addrspace(8)* null + %69 = extractelement <4 x float> %68, i32 2 + %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %71 = extractelement <4 x float> %70, i32 0 + %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %73 = extractelement <4 x float> %72, i32 1 + %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %75 = extractelement <4 x float> %74, i32 2 + %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %77 = extractelement <4 x float> %76, i32 0 + %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %79 = extractelement <4 x float> %78, i32 1 + %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) + %81 = extractelement <4 x float> %80, i32 2 + %82 = insertelement <4 x float> undef, float %51, i32 0 + %83 = insertelement <4 x float> %82, float %55, i32 1 + %84 = insertelement <4 x float> %83, float 
%59, i32 2 + %85 = insertelement <4 x float> %84, float %63, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1) + %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1 + %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2 + %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2) + %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1 + %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2 + %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2) + %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %95 = insertelement <4 x float> %94, float %65, i32 1 + %96 = insertelement <4 x float> %95, float %67, i32 2 + %97 = insertelement <4 x float> %96, float %69, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2) + %98 = insertelement <4 x float> undef, float %77, i32 0 + %99 = insertelement <4 x float> %98, float %79, i32 1 + %100 = insertelement <4 x float> %99, float %81, i32 2 + %101 = insertelement <4 x float> %100, float %71, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2) + %102 = insertelement <4 x float> undef, float %73, i32 0 + %103 = insertelement <4 x float> %102, float %75, i32 1 + %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2 + %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2) + %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1 + %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2 + %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2) + %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1 + %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2 + %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll new file mode 100644 index 00000000000..f388f8ffe29 --- /dev/null +++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll @@ -0,0 +1,58 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman + +define void @main(<4 x float> inreg, <4 x float> inreg) #0 { +main_body: + %2 = extractelement <4 x float> %0, i32 0 + %3 = extractelement <4 x float> %0, i32 1 + %4 = extractelement <4 x float> %0, i32 2 + %5 = extractelement <4 x float> %0, i32 3 + %6 = insertelement <4 x float> undef, float %2, i32 0 + %7 = insertelement <4 x float> %6, float %3, i32 1 + %8 = insertelement <4 x float> %7, float %4, i32 2 + %9 = insertelement <4 x float> %8, float %5, i32 3 + %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9) + %11 = extractelement <4 x float> %10, i32 0 + %12 = extractelement <4 x float> %10, i32 1 + %13 = extractelement <4 x float> %10, i32 2 + %14 = 
extractelement <4 x float> %10, i32 3 + %15 = call float @fabs(float %13) + %16 = fdiv float 1.000000e+00, %15 + %17 = fmul float %11, %16 + %18 = fadd float %17, 1.500000e+00 + %19 = fmul float %12, %16 + %20 = fadd float %19, 1.500000e+00 + %21 = insertelement <4 x float> undef, float %20, i32 0 + %22 = insertelement <4 x float> %21, float %18, i32 1 + %23 = insertelement <4 x float> %22, float %14, i32 2 + %24 = insertelement <4 x float> %23, float %5, i32 3 + %25 = extractelement <4 x float> %24, i32 0 + %26 = extractelement <4 x float> %24, i32 1 + %27 = extractelement <4 x float> %24, i32 2 + %28 = extractelement <4 x float> %24, i32 3 + %29 = insertelement <4 x float> undef, float %25, i32 0 + %30 = insertelement <4 x float> %29, float %26, i32 1 + %31 = insertelement <4 x float> %30, float %27, i32 2 + %32 = insertelement <4 x float> %31, float %28, i32 3 + %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13) + %34 = extractelement <4 x float> %33, i32 0 + %35 = insertelement <4 x float> undef, float %34, i32 0 + %36 = insertelement <4 x float> %35, float %34, i32 1 + %37 = insertelement <4 x float> %36, float %34, i32 2 + %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0) + ret void +} + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 + +; Function Attrs: readnone +declare float @fabs(float) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/r600cfg.ll b/test/CodeGen/AMDGPU/r600cfg.ll new file mode 100644 index 00000000000..c7b9d65220f --- /dev/null +++ b/test/CodeGen/AMDGPU/r600cfg.ll @@ -0,0 +1,119 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = bitcast float %0 to i32 + %5 = icmp eq i32 %4, 0 + %6 = sext i1 %5 to i32 + %7 = bitcast i32 %6 to float + %8 = bitcast float %7 to i32 + %9 = icmp ne i32 %8, 0 + %. 
= select i1 %9, float 0x36A0000000000000, float %0 + br label %LOOP + +LOOP: ; preds = %LOOP47, %main_body + %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] + %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] + %10 = bitcast float %temp4.1 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF41, label %ENDIF40 + +IF41: ; preds = %LOOP + %16 = insertelement <4 x float> undef, float %0, i32 0 + %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 + %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 + %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) + %20 = insertelement <4 x float> undef, float %0, i32 0 + %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 + %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 + %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) + %24 = insertelement <4 x float> undef, float %0, i32 0 + %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 + %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 + %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 + call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) + %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 + %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 + %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 + %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) + %32 = insertelement <4 x float> undef, float %0, i32 0 + %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 + %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 + %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) + ret void + +ENDIF40: ; preds = %LOOP + %36 = bitcast float %temp8.0 to i32 + %37 = add i32 %36, 1 + %38 = bitcast i32 %37 to float + %39 = bitcast float %temp4.1 to i32 + %40 = urem i32 %39, 2 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp eq i32 %42, 0 + %44 = sext i1 %43 to i32 + %45 = bitcast i32 %44 to float + %46 = bitcast float %45 to i32 + %47 = icmp ne i32 %46, 0 + %48 = bitcast float %temp4.1 to i32 + br i1 %47, label %IF44, label %ELSE45 + +IF44: ; preds = %ENDIF40 + %49 = udiv i32 %48, 2 + br label %ENDIF43 + +ELSE45: ; preds = %ENDIF40 + %50 = mul i32 3, %48 + %51 = add i32 %50, 1 + br label %ENDIF43 + +ENDIF43: ; preds = %ELSE45, %IF44 + %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] + %52 = bitcast i32 %.sink to float + %53 = load <4 x float>, <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = bitcast float %54 to i32 + br label %LOOP47 + +LOOP47: ; preds = %ENDIF48, %ENDIF43 + %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] + %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] + %56 = bitcast float %temp28.0 to i32 + %57 = icmp uge i32 %56, %55 + %58 = sext i1 %57 to i32 + %59 = bitcast i32 %58 to float + %60 = bitcast float %59 to i32 + %61 = icmp ne i32 %60, 0 + br i1 %61, label %LOOP, label %ENDIF48 + +ENDIF48: ; preds 
= %LOOP47 + %62 = bitcast float %temp12.1 to i32 + %63 = mul i32 %62, 2 + %64 = bitcast i32 %63 to float + %65 = bitcast float %64 to i32 + %66 = urem i32 %65, 2147483647 + %67 = bitcast i32 %66 to float + %68 = bitcast float %temp28.0 to i32 + %69 = add i32 %68, 1 + %70 = bitcast i32 %69 to float + br label %LOOP47 +} + +declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll new file mode 100644 index 00000000000..b4ac47afced --- /dev/null +++ b/test/CodeGen/AMDGPU/reciprocal.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> inreg %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = fdiv float 1.0, %r0 + %vec = insertelement <4 x float> undef, float %r1, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) + ret void +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll new file mode 100644 index 00000000000..de6bfb31088 --- /dev/null +++ b/test/CodeGen/AMDGPU/register-count-comments.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.SI.tid() nounwind readnone + +; SI-LABEL: {{^}}foo: +; SI: .section .AMDGPU.csdata +; SI: ; Kernel info: +; SI: ; NumSgprs: {{[0-9]+}} +; SI: ; NumVgprs: {{[0-9]+}} +define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { + %tid = call i32 @llvm.SI.tid() nounwind readnone + %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid + %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid + %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %result = add i32 %a, %b + store i32 %result, i32 addrspace(1)* %outptr, align 4 + ret void +} + +; SI-LABEL: {{^}}one_vgpr_used: +; SI: NumVgprs: 1 +define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { + store i32 %x, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll new file mode 100644 index 00000000000..187650ff9a5 --- /dev/null +++ b/test/CodeGen/AMDGPU/reorder-stores.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_load_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 + %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 + store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16 + ret 
void +} + +; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_write_b64 +; SI: ds_write_b64 +; SI: s_endpgm +define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { + %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 + %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 + store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 + store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 + ret void +} + +; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword +; SI: buffer_load_dword + + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword + +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword +; SI: s_endpgm +define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { + %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 + %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 + store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 + store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32 + ret void +} + +; SI-LABEL: {{^}}no_reorder_extload_64: +; SI: ds_read_b64 +; SI: ds_read_b64 +; SI: ds_write_b64 +; SI-NOT: ds_read +; SI: ds_write_b64 +; SI: s_endpgm +define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { + %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 + %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 + %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> + %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64> + %tmp7 = add <2 x i64> %tmp1ext, + %tmp9 = add <2 x i64> %tmp4ext, + %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32> + %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32> + store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8 + store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll new file mode 100644 index 00000000000..3f4ceb7e031 --- /dev/null +++ b/test/CodeGen/AMDGPU/rotl.i64.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}s_rotl_i64: +; BOTH-DAG: s_lshl_b64 +; BOTH-DAG: s_sub_i32 +; BOTH-DAG: s_lshr_b64 +; BOTH: s_or_b64 +; BOTH: s_endpgm +define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +entry: + %0 = shl i64 %x, %y + %1 = sub i64 64, %y + %2 = lshr i64 %x, %1 + %3 = or i64 %0, %2 + store i64 %3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotl_i64: 
+; SI-DAG: v_lshl_b64 +; VI-DAG: v_lshlrev_b64 +; BOTH-DAG: v_sub_i32 +; SI: v_lshr_b64 +; VI: v_lshrrev_b64 +; BOTH: v_or_b32 +; BOTH: v_or_b32 +; BOTH: s_endpgm +define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +entry: + %x = load i64, i64 addrspace(1)* %xptr, align 8 + %y = load i64, i64 addrspace(1)* %yptr, align 8 + %tmp0 = shl i64 %x, %y + %tmp1 = sub i64 64, %y + %tmp2 = lshr i64 %x, %tmp1 + %tmp3 = or i64 %tmp0, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll new file mode 100644 index 00000000000..6c144cd56ea --- /dev/null +++ b/test/CodeGen/AMDGPU/rotl.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rotl_i32: +; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x +; R600-NEXT: 32 +; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}} + +; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} +; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] +; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] +define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +entry: + %0 = shl i32 %x, %y + %1 = sub i32 32, %y + %2 = lshr i32 %x, %1 + %3 = or i32 %0, %2 + store i32 %3, i32 addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotl_v2i32: +; SI-DAG: s_sub_i32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: v_alignbit_b32 +; SI: s_endpgm +define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +entry: + %0 = shl <2 x i32> %x, %y + %1 = sub <2 x i32> , %y + %2 = lshr <2 x i32> %x, %1 + %3 = or <2 x i32> %0, %2 + store <2 x i32> %3, <2 x i32> addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotl_v4i32: +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI-DAG: s_sub_i32 +; SI-DAG: v_alignbit_b32 +; SI: s_endpgm +define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +entry: + %0 = shl <4 x i32> %x, %y + %1 = sub <4 x i32> , %y + %2 = lshr <4 x i32> %x, %1 + %3 = or <4 x i32> %0, %2 + store <4 x i32> %3, <4 x i32> addrspace(1)* %in + ret void +} diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll new file mode 100644 index 00000000000..586de44a566 --- /dev/null +++ b/test/CodeGen/AMDGPU/rotr.i64.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s + +; BOTH-LABEL: {{^}}s_rotr_i64: +; BOTH-DAG: s_sub_i32 +; BOTH-DAG: s_lshr_b64 +; BOTH-DAG: s_lshl_b64 +; BOTH: s_or_b64 +define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +entry: + %tmp0 = sub i64 64, %y + %tmp1 = shl i64 %x, %tmp0 + %tmp2 = lshr i64 %x, %y + %tmp3 = or i64 %tmp1, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotr_i64: +; BOTH-DAG: v_sub_i32 +; SI-DAG: v_lshr_b64 +; SI-DAG: v_lshl_b64 +; VI-DAG: v_lshrrev_b64 +; VI-DAG: v_lshlrev_b64 +; BOTH: v_or_b32 +; BOTH: v_or_b32 +define void @v_rotr_i64(i64 addrspace(1)* %in, 
i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +entry: + %x = load i64, i64 addrspace(1)* %xptr, align 8 + %y = load i64, i64 addrspace(1)* %yptr, align 8 + %tmp0 = sub i64 64, %y + %tmp1 = shl i64 %x, %tmp0 + %tmp2 = lshr i64 %x, %y + %tmp3 = or i64 %tmp1, %tmp2 + store i64 %tmp3, i64 addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}s_rotr_v2i64: +define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { +entry: + %tmp0 = sub <2 x i64> , %y + %tmp1 = shl <2 x i64> %x, %tmp0 + %tmp2 = lshr <2 x i64> %x, %y + %tmp3 = or <2 x i64> %tmp1, %tmp2 + store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in + ret void +} + +; BOTH-LABEL: {{^}}v_rotr_v2i64: +define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { +entry: + %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 + %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 + %tmp0 = sub <2 x i64> , %y + %tmp1 = shl <2 x i64> %x, %tmp0 + %tmp2 = lshr <2 x i64> %x, %y + %tmp3 = or <2 x i64> %tmp1, %tmp2 + store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in + ret void +} diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll new file mode 100644 index 00000000000..044f9ffe6d6 --- /dev/null +++ b/test/CodeGen/AMDGPU/rotr.ll @@ -0,0 +1,53 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}rotr_i32: +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +entry: + %tmp0 = sub i32 32, %y + %tmp1 = shl i32 %x, %tmp0 + %tmp2 = lshr i32 %x, %y + %tmp3 = or i32 %tmp1, %tmp2 + store i32 %tmp3, i32 addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotr_v2i32: +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +entry: + %tmp0 = sub <2 x i32> , %y + %tmp1 = shl <2 x i32> %x, %tmp0 + %tmp2 = lshr <2 x i32> %x, %y + %tmp3 = or <2 x i32> %tmp1, %tmp2 + store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in + ret void +} + +; FUNC-LABEL: {{^}}rotr_v4i32: +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT +; R600: BIT_ALIGN_INT + +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +; SI: v_alignbit_b32 +define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +entry: + %tmp0 = sub <4 x i32> , %y + %tmp1 = shl <4 x i32> %x, %tmp0 + %tmp2 = lshr <4 x i32> %x, %y + %tmp3 = or <4 x i32> %tmp1, %tmp2 + store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in + ret void +} diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll new file mode 100644 index 00000000000..b67b800c737 --- /dev/null +++ b/test/CodeGen/AMDGPU/rsq.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone +declare float @llvm.sqrt.f32(float) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; 
SI-LABEL: {{^}}rsq_f32: +; SI: v_rsq_f32_e32 +; SI: s_endpgm +define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { + %val = load float, float addrspace(1)* %in, align 4 + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}rsq_f64: +; SI-UNSAFE: v_rsq_f64_e32 +; SI-SAFE: v_sqrt_f64_e32 +; SI: s_endpgm +define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { + %val = load double, double addrspace(1)* %in, align 4 + %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone + %div = fdiv double 1.0, %sqrt + store double %div, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}rsq_f32_sgpr: +; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +; SI: s_endpgm +define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv float 1.0, %sqrt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; Recognize that this is rsqrt(a) * rcp(b) * c, +; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. + +; SI-LABEL: @rsqrt_fmul +; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 + +; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] +; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] +; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] +; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] +; SI-UNSAFE: buffer_store_dword [[RESULT]] + +; SI-SAFE-NOT: v_rsq_f32 + +; SI: s_endpgm +define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 + + %a = load float, float addrspace(1)* %gep.0 + %b = load float, float addrspace(1)* %gep.1 + %c = load float, float addrspace(1)* %gep.2 + + %x = call float @llvm.sqrt.f32(float %a) + %y = fmul float %x, %b + %z = fdiv float %c, %y + store float %z, float addrspace(1)* %out.gep + ret void +} diff --git a/test/CodeGen/AMDGPU/rv7x0_count3.ll b/test/CodeGen/AMDGPU/rv7x0_count3.ll new file mode 100644 index 00000000000..c3fd923e459 --- /dev/null +++ b/test/CodeGen/AMDGPU/rv7x0_count3.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s + +; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] + +define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { + %1 = extractelement <4 x float> %reg1, i32 0 + %2 = extractelement <4 x float> %reg1, i32 1 + %3 = extractelement <4 x float> %reg1, i32 2 + %4 = extractelement <4 x float> %reg1, i32 3 + %5 = insertelement <4 x float> undef, float %1, i32 0 + %6 = insertelement <4 x float> %5, float %2, i32 1 + %7 = insertelement <4 x float> %6, float %3, i32 2 + %8 = insertelement <4 x float> %7, float %4, i32 3 + %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) + 
%10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1) + %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1) + %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1) + %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1) + %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1) + %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1) + %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1) + %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1) + %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1) + %19 = fadd <4 x float> %9, %10 + %20 = fadd <4 x float> %19, %11 + %21 = fadd <4 x float> %20, %12 + %22 = fadd <4 x float> %21, %13 + %23 = fadd <4 x float> %22, %14 + %24 = fadd <4 x float> %23, %15 + %25 = fadd <4 x float> %24, %16 + %26 = fadd <4 x float> %25, %17 + %27 = fadd <4 x float> %26, %18 + call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2) + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll new file mode 100644 index 00000000000..6b1a36c979c --- /dev/null +++ b/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -0,0 +1,185 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_movk_i32_k0: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k1: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k2: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) + store i64 %or, i64 
addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k3: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k4: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k5: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k6: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k7: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} +; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) + store i64 %or, i64 addrspace(1)* %out + ret void +} + + +; SI-LABEL: {{^}}s_movk_i32_k8: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 
[[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k9: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k10: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k11: +; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff + store i64 %or, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_movk_i32_k12: +; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} +; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} +; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] +; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] +; SI: s_endpgm +define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { + %loada = load i64, i64 addrspace(1)* %a, align 4 + %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 + store i64 %or, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll new file mode 100644 index 00000000000..f8ced7942a6 --- /dev/null +++ b/test/CodeGen/AMDGPU/saddo.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: 
{{^}}saddo_i64_zext: +define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_saddo_i32: +define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_saddo_i32: +define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %sadd, 0 + %carry = extractvalue { i32, i1 } %sadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_saddo_i64: +define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_saddo_i64: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %sadd, 0 + %carry = extractvalue { i64, i1 } %sadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll new file mode 100644 index 00000000000..0b964957654 --- /dev/null +++ b/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -0,0 +1,118 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s + +; In this test both the pointer and the offset operands to the +; BUFFER_LOAD instructions end up being stored in vgprs. This +; requires us to add the pointer and offset together, store the +; result in the offset operand (vaddr), and then store 0 in an +; sgpr register pair and use that for the pointer operand +; (low 64-bits of srsrc). 
+
+; CHECK-LABEL: {{^}}mubuf:
+
+; Make sure we aren't using VGPRs for the source operand of s_mov_b64
+; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v
+
+; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
+; instructions
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() #1
+ %1 = call i32 @llvm.r600.read.tidig.y() #1
+ %2 = sext i32 %0 to i64
+ %3 = sext i32 %1 to i64
+ br label %loop
+
+loop:
+ %4 = phi i64 [0, %entry], [%5, %loop]
+ %5 = add i64 %2, %4
+ %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5
+ %7 = load i8, i8 addrspace(1)* %6, align 1
+ %8 = or i64 %5, 1
+ %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8
+ %10 = load i8, i8 addrspace(1)* %9, align 1
+ %11 = add i8 %7, %10
+ %12 = sext i8 %11 to i32
+ store i32 %12, i32 addrspace(1)* %out
+ %13 = icmp slt i64 %5, 10
+ br i1 %13, label %loop, label %done
+
+done:
+ ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare i32 @llvm.r600.read.tidig.y() #1
+
+attributes #1 = { nounwind readnone }
+
+; Test moving an SMRD instruction to the VALU
+
+; CHECK-LABEL: {{^}}smrd_valu:
+; CHECK: buffer_load_dword [[OUT:v[0-9]+]]
+; CHECK: buffer_store_dword [[OUT]]
+
+define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) {
+entry:
+ %0 = icmp ne i32 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+ br label %endif
+
+else:
+ %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in
+ %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2
+ br label %endif
+
+endif:
+ %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else]
+ %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000
+ %6 = load i32, i32 addrspace(2)* %5
+ store i32 %6, i32 addrspace(1)* %out
+ ret void
+}
+
+; Test moving an SMRD with an immediate offset to the VALU
+
+; CHECK-LABEL: {{^}}smrd_valu2:
+; CHECK: buffer_load_dword
+define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
+entry:
+ %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %1 = add i32 %0, 4
+ %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4
+ %3 = load i32, i32 addrspace(2)* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v8i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+ %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
+ %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
+ %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4
+ store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
+ ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v16i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+ %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+ %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
+ %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
+ %tmp3 =
load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 + store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll new file mode 100644 index 00000000000..0970e5d3063 --- /dev/null +++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: s_endpgm +define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %tmp1 = load i32, i32 addrspace(1)* %in, align 4 + %bc = bitcast i32 %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> + store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: +; SI: buffer_load_dword [[VAL:v[0-9]+]], +; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: buffer_store_short [[RESULT]] +; SI: s_endpgm +define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %tmp1 = load float, float addrspace(1)* %in, align 4 + %bc = bitcast float %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> + store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 + ret void +} + +; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed +; to produce one, but for some reason never made it to selection. 
+ + +; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 +; %bc = bitcast i32 %tmp1 to <4 x i8> + +; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> +; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4 +; ret void +; } + +; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 +; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 +; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> +; %add = add <4 x i32> %bc, +; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> +; %add = add <8 x i16> %bc, +; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> +; %add = add <4 x i16> %bc, +; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 +; ret void +; } + +; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { +; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 +; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> +; %add = add <4 x i16> %bc, +; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 +; ret void +; } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll new file mode 100644 index 00000000000..11e8f5176f4 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll @@ -0,0 +1,82 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = fcmp ult float %1, 0.000000e+00 + %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 + %6 = fsub float -0.000000e+00, %5 + %7 = fptosi float %6 to i32 + %8 = bitcast i32 %7 to float + %9 = fcmp ult float %0, 5.700000e+01 + %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 + %11 = fsub float -0.000000e+00, %10 + %12 = fptosi float %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %8 to i32 + %15 = bitcast float %13 to i32 + %16 = and i32 %14, %15 + %17 = bitcast i32 %16 to float + %18 = bitcast float %17 to i32 + %19 = icmp ne i32 %18, 0 + %20 = fcmp ult float %0, 0.000000e+00 + %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 + %22 = fsub float -0.000000e+00, %21 + %23 = fptosi float %22 to i32 + %24 = bitcast i32 %23 to float + %25 = bitcast float %24 to i32 + %26 = icmp ne i32 %25, 0 + br i1 %19, label %IF, label %ELSE + +IF: ; preds = %main_body + %. 
= select i1 %26, float 0.000000e+00, float 1.000000e+00 + %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 + br label %ENDIF + +ELSE: ; preds = %main_body + br i1 %26, label %ENDIF, label %ELSE17 + +ENDIF: ; preds = %ELSE17, %ELSE, %IF + %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] + %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] + %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00) + %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00) + %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00) + %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %31 = insertelement <4 x float> undef, float %27, i32 0 + %32 = insertelement <4 x float> %31, float %28, i32 1 + %33 = insertelement <4 x float> %32, float %29, i32 2 + %34 = insertelement <4 x float> %33, float %30, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0) + ret void + +ELSE17: ; preds = %ELSE + %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 + %38 = fadd float %35, 0x3FC99999A0000000 + %39 = fadd float %36, 0x3FC99999A0000000 + %40 = fadd float %37, 0x3FC99999A0000000 + %41 = fadd float %38, 0x3FC99999A0000000 + %42 = fadd float %39, 0x3FC99999A0000000 + %43 = fadd float %40, 0x3FC99999A0000000 + %44 = fadd float %41, 0x3FC99999A0000000 + %45 = fadd float %42, 0x3FC99999A0000000 + %46 = fadd float %43, 0x3FC99999A0000000 + %47 = fadd float %44, 0x3FC99999A0000000 + %48 = fadd float %45, 0x3FC99999A0000000 + %49 = fadd float %46, 0x3FC99999A0000000 + br label %ENDIF +} + +declare float @llvm.AMDIL.clamp.(float, float, float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readnone } +attributes #1 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll new file mode 100644 index 00000000000..759197ca61f --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll @@ -0,0 +1,88 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 3 + %2 = fptosi float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = bitcast float %3 to i32 + %5 = sdiv i32 %4, 4 + %6 = bitcast i32 %5 to float + %7 = bitcast float %6 to i32 + %8 = mul i32 %7, 4 + %9 = bitcast i32 %8 to float + %10 = bitcast float %9 to i32 + %11 = sub i32 0, %10 + %12 = bitcast i32 %11 to float + %13 = bitcast float %3 to i32 + %14 = bitcast float %12 to i32 + %15 = add i32 %13, %14 + %16 = bitcast i32 %15 to float + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = load <4 x float>, <4 x float> addrspace(9)* null + %20 = extractelement <4 x float> %19, i32 1 + %21 = load <4 x float>, <4 x float> addrspace(9)* null + %22 = extractelement <4 x float> %21, i32 2 + br label %LOOP + +LOOP: ; preds = %IF31, %main_body + %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] + %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] + %temp5.0 = phi float [ %20, 
%main_body ], [ %temp5.1, %IF31 ] + %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] + %23 = bitcast float %temp12.0 to i32 + %24 = bitcast float %6 to i32 + %25 = icmp sge i32 %23, %24 + %26 = sext i1 %25 to i32 + %27 = bitcast i32 %26 to float + %28 = bitcast float %27 to i32 + %29 = icmp ne i32 %28, 0 + br i1 %29, label %IF, label %LOOP29 + +IF: ; preds = %LOOP + %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) + %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) + %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %34 = insertelement <4 x float> undef, float %30, i32 0 + %35 = insertelement <4 x float> %34, float %31, i32 1 + %36 = insertelement <4 x float> %35, float %32, i32 2 + %37 = insertelement <4 x float> %36, float %33, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0) + ret void + +LOOP29: ; preds = %LOOP, %ENDIF30 + %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] + %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] + %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] + %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] + %38 = bitcast float %temp20.0 to i32 + %39 = bitcast float %16 to i32 + %40 = icmp sge i32 %38, %39 + %41 = sext i1 %40 to i32 + %42 = bitcast i32 %41 to float + %43 = bitcast float %42 to i32 + %44 = icmp ne i32 %43, 0 + br i1 %44, label %IF31, label %ENDIF30 + +IF31: ; preds = %LOOP29 + %45 = bitcast float %temp12.0 to i32 + %46 = add i32 %45, 1 + %47 = bitcast i32 %46 to float + br label %LOOP + +ENDIF30: ; preds = %LOOP29 + %48 = bitcast float %temp20.0 to i32 + %49 = add i32 %48, 1 + %50 = bitcast i32 %49 to float + br label %LOOP29 +} + +declare float @llvm.AMDIL.clamp.(float, float, float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll new file mode 100644 index 00000000000..28cc08abc02 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll @@ -0,0 +1,55 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 3 + %2 = fptosi float %1 to i32 + %3 = bitcast i32 %2 to float + %4 = load <4 x float>, <4 x float> addrspace(9)* null + %5 = extractelement <4 x float> %4, i32 0 + %6 = load <4 x float>, <4 x float> addrspace(9)* null + %7 = extractelement <4 x float> %6, i32 1 + %8 = load <4 x float>, <4 x float> addrspace(9)* null + %9 = extractelement <4 x float> %8, i32 2 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] + %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] + %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] + %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] + %10 = bitcast float %temp8.0 to i32 + %11 = bitcast float %3 to i32 + %12 = icmp sge i32 %10, %11 + %13 = sext i1 %12 to i32 + %14 = bitcast i32 %13 to float + %15 = bitcast float %14 to i32 + %16 = icmp ne i32 %15, 0 + br i1 %16, label %IF, label %ENDIF + +IF: ; preds = %LOOP + %17 = call float @llvm.AMDIL.clamp.(float 
%temp4.0, float 0.000000e+00, float 1.000000e+00)
+ %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
+ %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
+ %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ %21 = insertelement <4 x float> undef, float %17, i32 0
+ %22 = insertelement <4 x float> %21, float %18, i32 1
+ %23 = insertelement <4 x float> %22, float %19, i32 2
+ %24 = insertelement <4 x float> %23, float %20, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %25 = bitcast float %temp8.0 to i32
+ %26 = add i32 %25, 1
+ %27 = bitcast i32 %26 to float
+ br label %LOOP
+}
+
+declare float @llvm.AMDIL.clamp.(float, float, float) #0
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
new file mode 100644
index 00000000000..3f728fd873b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only thing areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
+
+; FUNC-LABEL: {{^}}cluster_global_arg_loads:
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
+; SI: buffer_store_dword [[REG0]]
+; SI: buffer_store_dword [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+ %load0 = load i32, i32 addrspace(1)* %ptr, align 4
+ %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1
+ %load1 = load i32, i32 addrspace(1)* %gep, align 4
+ store i32 %load0, i32 addrspace(1)* %out0, align 4
+ store i32 %load1, i32 addrspace(1)* %out1, align 4
+ ret void
+}
+
+; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
+; an MUBUF load which does not have a vaddr operand.
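+
+; An MUBUF access only carries a vaddr operand when it uses offen, idxen or
+; addr64 addressing; an offset-only load is addressed entirely through the
+; SGPR resource descriptor plus an immediate offset, so there is no VGPR
+; address operand at all. The base-pointer comparison therefore has to bail
+; out when that operand is missing, roughly like this (sketch only, not the
+; in-tree code; the usual named-operand helpers are assumed):
+;
+;   int VAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+;   if (VAddrIdx == -1)
+;     return false; // no VGPR address to compare against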
+; FUNC-LABEL: {{^}}same_base_ptr_crash: +; SI: buffer_load_dword +; SI: buffer_load_dword +define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +entry: + %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset + %tmp0 = load i32, i32 addrspace(1)* %out + %tmp1 = load i32, i32 addrspace(1)* %out1 + %tmp2 = add i32 %tmp0, %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll new file mode 100644 index 00000000000..54946509683 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-if-2.ll @@ -0,0 +1,94 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) + %1 = extractelement <4 x float> %0, i32 0 + %2 = fadd float 1.000000e+03, %1 + %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %4 = extractelement <4 x float> %3, i32 0 + %5 = bitcast float %4 to i32 + %6 = icmp eq i32 %5, 0 + %7 = sext i1 %6 to i32 + %8 = bitcast i32 %7 to float + %9 = bitcast float %8 to i32 + %10 = icmp ne i32 %9, 0 + br i1 %10, label %IF, label %ELSE + +IF: ; preds = %main_body + %11 = call float @fabs(float %2) + %12 = fcmp ueq float %11, 0x7FF0000000000000 + %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00 + %14 = fsub float -0.000000e+00, %13 + %15 = fptosi float %14 to i32 + %16 = bitcast i32 %15 to float + %17 = bitcast float %16 to i32 + %18 = icmp ne i32 %17, 0 + %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00 + %19 = fcmp une float %2, %2 + %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00 + %21 = fsub float -0.000000e+00, %20 + %22 = fptosi float %21 to i32 + %23 = bitcast i32 %22 to float + %24 = bitcast float %23 to i32 + %25 = icmp ne i32 %24, 0 + %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00 + %26 = bitcast float %. 
to i32 + %27 = sitofp i32 %26 to float + %28 = bitcast float %temp8.0 to i32 + %29 = sitofp i32 %28 to float + %30 = fcmp ugt float %2, 0.000000e+00 + %31 = select i1 %30, float 1.000000e+00, float %2 + %32 = fcmp uge float %31, 0.000000e+00 + %33 = select i1 %32, float %31, float -1.000000e+00 + %34 = fadd float %33, 1.000000e+00 + %35 = fmul float %34, 5.000000e-01 + br label %ENDIF + +ELSE: ; preds = %main_body + %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %37 = extractelement <4 x float> %36, i32 0 + %38 = bitcast float %37 to i32 + %39 = icmp eq i32 %38, 1 + %40 = sext i1 %39 to i32 + %41 = bitcast i32 %40 to float + %42 = bitcast float %41 to i32 + %43 = icmp ne i32 %42, 0 + br i1 %43, label %IF23, label %ENDIF + +ENDIF: ; preds = %IF23, %ELSE, %IF + %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ] + %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ] + %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] + %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] + %44 = insertelement <4 x float> undef, float %temp4.0, i32 0 + %45 = insertelement <4 x float> %44, float %temp5.0, i32 1 + %46 = insertelement <4 x float> %45, float %temp6.0, i32 2 + %47 = insertelement <4 x float> %46, float %temp7.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0) + ret void + +IF23: ; preds = %ELSE + %48 = fcmp ult float 0.000000e+00, %2 + %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00 + %50 = fsub float -0.000000e+00, %49 + %51 = fptosi float %50 to i32 + %52 = bitcast i32 %51 to float + %53 = bitcast float %52 to i32 + %54 = icmp ne i32 %53, 0 + %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00 + %55 = bitcast float %.28 to i32 + %56 = sitofp i32 %55 to float + %57 = load <4 x float>, <4 x float> addrspace(8)* null + %58 = extractelement <4 x float> %57, i32 0 + %59 = fsub float -0.000000e+00, %58 + %60 = fadd float %2, %59 + br label %ENDIF +} + +declare float @fabs(float) #0 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { readonly } diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll new file mode 100644 index 00000000000..94c653c8f25 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-if.ll @@ -0,0 +1,46 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs +;REQUIRES: asserts + +define void @main() { +main_body: + %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %1 = extractelement <4 x float> %0, i32 0 + %2 = bitcast float %1 to i32 + %3 = icmp eq i32 %2, 0 + %4 = sext i1 %3 to i32 + %5 = bitcast i32 %4 to float + %6 = bitcast float %5 to i32 + %7 = icmp ne i32 %6, 0 + br i1 %7, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %9 = extractelement <4 x float> %8, i32 0 + %10 = bitcast float %9 to i32 + %11 = icmp eq i32 %10, 1 + %12 = sext i1 %11 to i32 + %13 = bitcast i32 %12 to float + %14 = bitcast float %13 to i32 + %15 = icmp ne i32 %14, 0 + br i1 %15, label %IF13, label %ENDIF + +ENDIF: ; preds = %IF13, %ELSE, %main_body + %temp.0 = phi float [ 1.000000e+03, %main_body ], [ 
1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] + %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] + %16 = insertelement <4 x float> undef, float %temp.0, i32 0 + %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 + %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2 + %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) + ret void + +IF13: ; preds = %ELSE + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fsub float -0.000000e+00, %21 + %23 = fadd float 1.000000e+03, %22 + br label %ENDIF +} + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll new file mode 100644 index 00000000000..6b3e0814c38 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s + +; FUNC-LABEL: {{^}}cluster_arg_loads: +; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe +; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; VI-NEXT: s_nop 0 +; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 +define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { + store i32 %x, i32 addrspace(1)* %out0, align 4 + store i32 %y, i32 addrspace(1)* %out1, align 4 + ret void +} + +; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when +; s_load_dwordx2 has a register offset + +; FUNC-LABEL: @same_base_ptr_crash +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GCN: s_endpgm +define void @same_base_ptr_crash(i64 addrspace(1)* %out, + i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, + i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, + i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, + i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, + i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, + i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, + i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55, + i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63, + i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71, + i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79, + i64 %arg80, i64 
%arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87, + i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95, + i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103, + i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111, + i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, + i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { +entry: + %value = add i64 %arg125, %arg126 + store i64 %value, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll new file mode 100644 index 00000000000..3863afda5dd --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -0,0 +1,163 @@ +; XFAIL: * +; REQUIRES: asserts +; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI + +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate + + +; SI-LABEL: {{^}}main( +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 2 + %2 = fcmp ult float %0, 0.000000e+00 + %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00 + %4 = fsub float -0.000000e+00, %3 + %5 = fptosi float %4 to i32 + %6 = bitcast i32 %5 to float + %7 = bitcast float %6 to i32 + %8 = icmp ne i32 %7, 0 + br i1 %8, label %LOOP, label %ENDIF + +Flow1: ; preds = %ENDIF19, %ENDIF16 + %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ] + %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ] + %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ] + %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ] + %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ] + %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ] + br label %Flow + +Flow2: ; preds = %Flow + br label %ENDIF + +ENDIF: ; preds = %main_body, %Flow2 + %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ] + %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ] + %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ] + %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] + %15 = extractelement <4 x float> %reg1, i32 1 + %16 = extractelement <4 x float> %reg1, i32 3 + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = fmul float %18, %0 + %20 = load <4 x float>, <4 x float> addrspace(9)* null + %21 = extractelement <4 x float> %20, i32 1 + %22 = fmul float %21, %0 + %23 = load <4 x float>, <4 x float> addrspace(9)* null + %24 = extractelement <4 x float> %23, i32 2 + %25 = fmul float %24, %0 + %26 = load <4 x float>, <4 x float> addrspace(9)* null + %27 = extractelement <4 x float> %26, i32 3 + %28 = fmul float %27, %0 + %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %30 = extractelement <4 x float> %29, i32 0 + %31 = fmul float %30, %15 + %32 = fadd float %31, %19 + %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %34 = 
extractelement <4 x float> %33, i32 1 + %35 = fmul float %34, %15 + %36 = fadd float %35, %22 + %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %38 = extractelement <4 x float> %37, i32 2 + %39 = fmul float %38, %15 + %40 = fadd float %39, %25 + %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %42 = extractelement <4 x float> %41, i32 3 + %43 = fmul float %42, %15 + %44 = fadd float %43, %28 + %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %46 = extractelement <4 x float> %45, i32 0 + %47 = fmul float %46, %1 + %48 = fadd float %47, %32 + %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %50 = extractelement <4 x float> %49, i32 1 + %51 = fmul float %50, %1 + %52 = fadd float %51, %36 + %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %54 = extractelement <4 x float> %53, i32 2 + %55 = fmul float %54, %1 + %56 = fadd float %55, %40 + %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %58 = extractelement <4 x float> %57, i32 3 + %59 = fmul float %58, %1 + %60 = fadd float %59, %44 + %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %62 = extractelement <4 x float> %61, i32 0 + %63 = fmul float %62, %16 + %64 = fadd float %63, %48 + %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %66 = extractelement <4 x float> %65, i32 1 + %67 = fmul float %66, %16 + %68 = fadd float %67, %52 + %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %70 = extractelement <4 x float> %69, i32 2 + %71 = fmul float %70, %16 + %72 = fadd float %71, %56 + %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %74 = extractelement <4 x float> %73, i32 3 + %75 = fmul float %74, %16 + %76 = fadd float %75, %60 + %77 = insertelement <4 x float> undef, float %64, i32 0 + %78 = insertelement <4 x float> %77, float %68, i32 1 + %79 = insertelement <4 x float> %78, float %72, i32 2 + %80 = insertelement <4 x float> %79, float %76, i32 3 + call void @llvm.AMDGPU.barrier.local() + %81 = insertelement <4 x float> undef, float %temp.0, i32 0 + %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 + %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 + %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 + call void @llvm.AMDGPU.barrier.local() + ret void + +LOOP: ; preds = %main_body, %Flow + %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ] + %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ] + %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ] + %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ] + %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ] + %85 = fcmp uge float %temp4.0, %0 + %86 = select i1 %85, 
float 1.000000e+00, float 0.000000e+00 + %87 = fsub float -0.000000e+00, %86 + %88 = fptosi float %87 to i32 + %89 = bitcast i32 %88 to float + %90 = bitcast float %89 to i32 + %91 = icmp ne i32 %90, 0 + %92 = xor i1 %91, true + br i1 %92, label %ENDIF16, label %Flow + +ENDIF16: ; preds = %LOOP + %93 = fcmp une float %1, %temp4.0 + %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00 + %95 = fsub float -0.000000e+00, %94 + %96 = fptosi float %95 to i32 + %97 = bitcast i32 %96 to float + %98 = bitcast float %97 to i32 + %99 = icmp ne i32 %98, 0 + %100 = xor i1 %99, true + br i1 %100, label %ENDIF19, label %Flow1 + +Flow: ; preds = %Flow1, %LOOP + %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ] + %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ] + %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ] + %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ] + %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ] + %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ] + %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ] + %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ] + %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ] + %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ] + br i1 %110, label %Flow2, label %LOOP + +ENDIF19: ; preds = %ENDIF16 + %111 = fadd float %temp.1, 1.000000e+00 + %112 = fadd float %temp1.1, 0.000000e+00 + %113 = fadd float %temp2.1, 0.000000e+00 + %114 = fadd float %temp3.1, 0.000000e+00 + %115 = fadd float %temp4.0, 1.000000e+00 + br label %Flow1 +} + +attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll new file mode 100644 index 00000000000..8d980dbf899 --- /dev/null +++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll @@ -0,0 +1,132 @@ +;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched +;REQUIRES: asserts + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = fcmp ult float %0, 0.000000e+00 + %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 + %6 = fsub float -0.000000e+00, %5 + %7 = fptosi float %6 to i32 + %8 = bitcast i32 %7 to float + %9 = bitcast float %8 to i32 + %10 = icmp ne i32 %9, 0 + br i1 %10, label %LOOP, label %ENDIF + +ENDIF: ; preds = %ENDIF16, %LOOP, %main_body + %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ] + %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ] + %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ] + %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ] + %11 = load <4 x float>, <4 x float> addrspace(9)* null + %12 = extractelement <4 x float> %11, i32 0 + %13 = fmul float %12, %0 + %14 = load <4 x float>, <4 x float> addrspace(9)* null + %15 = extractelement <4 x float> %14, i32 1 + %16 = fmul float %15, %0 + %17 = load <4 x float>, <4 x float> addrspace(9)* null + %18 = extractelement <4 x float> %17, i32 2 + %19 = fmul float %18, %0 + %20 = load <4 x float>, <4 x float> addrspace(9)* null + %21 = extractelement <4 x float> %20, i32 3 + %22 = fmul float %21, %0 + %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* 
null, i64 0, i32 1) + %24 = extractelement <4 x float> %23, i32 0 + %25 = fmul float %24, %1 + %26 = fadd float %25, %13 + %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %28 = extractelement <4 x float> %27, i32 1 + %29 = fmul float %28, %1 + %30 = fadd float %29, %16 + %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %32 = extractelement <4 x float> %31, i32 2 + %33 = fmul float %32, %1 + %34 = fadd float %33, %19 + %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %36 = extractelement <4 x float> %35, i32 3 + %37 = fmul float %36, %1 + %38 = fadd float %37, %22 + %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %40 = extractelement <4 x float> %39, i32 0 + %41 = fmul float %40, %2 + %42 = fadd float %41, %26 + %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %44 = extractelement <4 x float> %43, i32 1 + %45 = fmul float %44, %2 + %46 = fadd float %45, %30 + %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %48 = extractelement <4 x float> %47, i32 2 + %49 = fmul float %48, %2 + %50 = fadd float %49, %34 + %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %52 = extractelement <4 x float> %51, i32 3 + %53 = fmul float %52, %2 + %54 = fadd float %53, %38 + %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %56 = extractelement <4 x float> %55, i32 0 + %57 = fmul float %56, %3 + %58 = fadd float %57, %42 + %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %60 = extractelement <4 x float> %59, i32 1 + %61 = fmul float %60, %3 + %62 = fadd float %61, %46 + %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %64 = extractelement <4 x float> %63, i32 2 + %65 = fmul float %64, %3 + %66 = fadd float %65, %50 + %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) + %68 = extractelement <4 x float> %67, i32 3 + %69 = fmul float %68, %3 + %70 = fadd float %69, %54 + %71 = insertelement <4 x float> undef, float %58, i32 0 + %72 = insertelement <4 x float> %71, float %62, i32 1 + %73 = insertelement <4 x float> %72, float %66, i32 2 + %74 = insertelement <4 x float> %73, float %70, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1) + %75 = insertelement <4 x float> undef, float %temp.0, i32 0 + %76 = insertelement <4 x float> %75, float %temp1.0, i32 1 + %77 = insertelement <4 x float> %76, float %temp2.0, i32 2 + %78 = insertelement <4 x float> %77, float %temp3.0, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2) + ret void + +LOOP: ; preds = %main_body, %ENDIF19 + %temp.1 = phi float [ %93, %ENDIF19 ], [ 0.000000e+00, %main_body ] + 
%temp1.1 = phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ]
+ %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ]
+ %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ]
+ %79 = fcmp uge float %temp4.0, %0
+ %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00
+ %81 = fsub float -0.000000e+00, %80
+ %82 = fptosi float %81 to i32
+ %83 = bitcast i32 %82 to float
+ %84 = bitcast float %83 to i32
+ %85 = icmp ne i32 %84, 0
+ br i1 %85, label %ENDIF, label %ENDIF16
+
+ENDIF16: ; preds = %LOOP
+ %86 = fcmp une float %2, %temp4.0
+ %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00
+ %88 = fsub float -0.000000e+00, %87
+ %89 = fptosi float %88 to i32
+ %90 = bitcast i32 %89 to float
+ %91 = bitcast float %90 to i32
+ %92 = icmp ne i32 %91, 0
+ br i1 %92, label %ENDIF, label %ENDIF19
+
+ENDIF19: ; preds = %ENDIF16
+ %93 = fadd float %temp.1, 1.000000e+00
+ %94 = fadd float %temp1.1, 0.000000e+00
+ %95 = fadd float %temp2.1, 0.000000e+00
+ %96 = fadd float %temp3.1, 0.000000e+00
+ %97 = fadd float %temp4.0, 1.000000e+00
+ br label %LOOP
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
new file mode 100644
index 00000000000..56088718ada
--- /dev/null
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; When a frame index offset is more than 12 bits, make sure we don't store
+; it in mubuf's offset field.
+
+; Also, make sure we use the same register for storing the scratch buffer address
+; for both stores. This register is allocated by the register scavenger, so we
+; should be able to reuse the same register for each scratch buffer access.
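+
+; A rough sanity check of the numbers used below: each alloca is
+; [8192 x i32], i.e. 8192 * 4 = 32768 = 0x8000 bytes, while the mubuf
+; immediate offset field is only 12 bits wide (0..4095). The second scratch
+; buffer therefore starts past what the offset field can encode, which is why
+; the checks expect its base (0x8000) to be put in a VGPR and used with offen
+; addressing instead of being folded into the immediate offset.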
+ +; CHECK-LABEL: {{^}}legal_offset_fi: +; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} +; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen +; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000 +; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} + +define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { +entry: + %scratch0 = alloca [8192 x i32] + %scratch1 = alloca [8192 x i32] + + %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0 + store i32 1, i32* %scratchptr0 + + %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0 + store i32 2, i32* %scratchptr1 + + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %if, label %else + +if: + %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset + %if_value = load i32, i32* %if_ptr + br label %done + +else: + %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset + %else_value = load i32, i32* %else_ptr + br label %done + +done: + %value = phi i32 [%if_value, %if], [%else_value, %else] + store i32 %value, i32 addrspace(1)* %out + ret void + + ret void + +} + +; CHECK-LABEL: {{^}}legal_offset_fi_offset +; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen +; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 +; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} + +define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { +entry: + %scratch0 = alloca [8192 x i32] + %scratch1 = alloca [8192 x i32] + + %offset0 = load i32, i32 addrspace(1)* %offsets + %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0 + store i32 %offset0, i32* %scratchptr0 + + %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1 + %offset1 = load i32, i32 addrspace(1)* %offsetptr1 + %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1 + store i32 %offset1, i32* %scratchptr1 + + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %if, label %else + +if: + %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset + %if_value = load i32, i32* %if_ptr + br label %done + +else: + %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset + %else_value = load i32, i32* %else_ptr + br label %done + +done: + %value = phi i32 [%if_value, %if], [%else_value, %else] + store i32 %value, i32 addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll new file mode 100644 index 00000000000..de645353a40 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdiv.ll @@ -0,0 +1,104 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; The code generated by sdiv is long and complex and may frequently change. +; The goal of this test is to make sure the ISel doesn't fail. 
+;
+; This program was previously failing to compile when one of the selectcc
+; opcodes generated by the sdiv lowering was being legalized and optimized to:
+; selectcc Remainder -1, 0, -1, SETGT
+; This was fixed by adding an additional pattern in R600Instructions.td to
+; match this pattern with a CNDGE_INT.
+
+; FUNC-LABEL: {{^}}sdiv_i32:
+; EG: CF_END
+define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+ %num = load i32, i32 addrspace(1) * %in
+ %den = load i32, i32 addrspace(1) * %den_ptr
+ %result = sdiv i32 %num, %den
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv_i32_4:
+define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 4
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+; Multiply by a weird constant to make sure setIntDivIsCheap is
+; working.
+
+; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
+; SI: v_add_i32
+; SI: v_lshrrev_b32
+; SI: v_ashrrev_i32
+; SI: v_add_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %num = load i32, i32 addrspace(1) * %in
+ %result = sdiv i32 %num, 3435
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <2 x i32> %num, %den
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+ %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
+ %result = sdiv <2 x i32> %num, <i32 4, i32 4>
+ store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
+ %result = sdiv <4 x i32> %num, %den
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
+ %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Tests for 64-bit divide bypass.
+; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %result = sdiv i64 %a, %b +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %result = srem i64 %a, %b +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } + +; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; %resultdiv = sdiv i64 %a, %b +; %resultrem = srem i64 %a, %b +; %result = add i64 %resultdiv, %resultrem +; store i64 %result, i64 addrspace(1)* %out, align 8 +; ret void +; } diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll new file mode 100644 index 00000000000..ad5df39f550 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdivrem24.ll @@ -0,0 +1,239 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sdiv24_i8: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = sdiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sdiv24_i16: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = sdiv i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}sdiv24_i32: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sdiv25_i32: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; 
EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = sdiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}srem24_i8: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = srem i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srem24_i16: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = srem i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}srem24_i32: +; SI: v_cvt_f32_i32 +; SI: v_cvt_f32_i32 +; SI: v_rcp_f32 +; SI: v_cvt_i32_f32 + +; EG: INT_TO_FLT +; EG-DAG: INT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_INT +define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}srem25_i32: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_srem24_i32_1: +; SI-NOT: v_cvt_f32_i32 
+; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = ashr i32 %num.i24.0, 8 + %den.i24 = ashr i32 %den.i24.0, 7 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_srem24_i32_2: +; SI-NOT: v_cvt_f32_i32 +; SI-NOT: v_rcp_f32 + +; EG-NOT: INT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = ashr i32 %num.i24.0, 7 + %den.i24 = ashr i32 %den.i24.0, 8 + %result = srem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll new file mode 100644 index 00000000000..a9b2b7f9df5 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdivrem64.ll @@ -0,0 +1,225 @@ +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_sdiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN: v_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = sdiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: AND_INT {{.*}}, 1, + 
+;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = urem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 33 + %2 = ashr i64 %y, 33 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_sdiv2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = sdiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_srem2464: +;EG: INT_TO_FLT +;EG: INT_TO_FLT +;EG: FLT_TO_INT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = ashr i64 %x, 40 + %2 = ashr i64 %y, 40 + %result = srem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll new file mode 100644 index 00000000000..6735394e93a --- /dev/null +++ b/test/CodeGen/AMDGPU/select-i1.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI + +; FUNC-LABEL: {{^}}select_i1: +; SI: v_cndmask_b32 +; SI-NOT: v_cndmask_b32 +define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i1 %a, i1 %b + store i1 %sel, i1 addrspace(1)* %out, align 4 + ret void +} + diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll new file mode 100644 index 00000000000..59082c65cc8 --- /dev/null +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -0,0 +1,156 @@ +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc 
-verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; Test expansion of scalar selects on vectors. +; Evergreen not enabled since it seems to be having problems with doubles. + + +; FUNC-LABEL: {{^}}select_v4i8: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { + %cmp = icmp eq i8 %c, 0 + %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}select_v4i16: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b + store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}select_v2i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: buffer_store_dwordx2 +define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b + store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}select_v4i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: buffer_store_dwordx4 +define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8i32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b + store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v2f32: +; SI: buffer_store_dwordx2 +define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v4f32: +; SI: buffer_store_dwordx4 +define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x float> %a, <4 x float> %b + store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8f32: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x float> %a, <8 x float> %b + store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: 
{{^}}select_v2f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x double> %a, <2 x double> %b + store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v4f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x double> %a, <4 x double> %b + store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: {{^}}select_v8f64: +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e64 +define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x double> %a, <8 x double> %b + store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/select.ll b/test/CodeGen/AMDGPU/select.ll new file mode 100644 index 00000000000..45f3cd5a7ac --- /dev/null +++ b/test/CodeGen/AMDGPU/select.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; Normally icmp + select is optimized to select_cc, when this happens the +; DAGLegalizer never sees the select and doesn't have a chance to legalize it. +; +; In order to avoid the select_cc optimization, this test case calculates the +; condition for the select in a separate basic block.
+ +; FUNC-LABEL: {{^}}select: +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, + <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, + <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, + i32 %cond) { +entry: + br label %for +body: + %inc = add i32 %i, 1 + %br_cmp.i = icmp eq i1 %br_cmp, 0 + br label %for +for: + %i = phi i32 [ %inc, %body], [ 0, %entry ] + %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] + %0 = icmp eq i32 %cond, %i + %1 = select i1 %br_cmp, i32 2, i32 3 + %2 = select i1 %br_cmp, float 2.0 , float 5.0 + %3 = select i1 %br_cmp, <2 x i32> , <2 x i32> + %4 = select i1 %br_cmp, <2 x float> , <2 x float> + %5 = select i1 %br_cmp, <4 x i32> , <4 x i32> + %6 = select i1 %br_cmp, <4 x float> , <4 x float> + br i1 %0, label %body, label %done + +done: + store i32 %1, i32 addrspace(1)* %i32out + store float %2, float addrspace(1)* %f32out + store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out + store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out + store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out + store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out + ret void +} diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll new file mode 100644 index 00000000000..5cebb30dc72 --- /dev/null +++ b/test/CodeGen/AMDGPU/select64.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK-LABEL: {{^}}select0: +; i64 select should be split into two i32 selects, and we shouldn't need +; to use a shift to extract the hi dword of the input.
+; CHECK-NOT: s_lshr_b64 +; CHECK: v_cndmask +; CHECK: v_cndmask +define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { +entry: + %0 = icmp ugt i32 %cond, 5 + %1 = select i1 %0, i64 0, i64 %in + store i64 %1, i64 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}select_trunc_i64: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i64 0, i64 %in + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}select_trunc_i64_2: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %sel = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}v_select_trunc_i64_2: +; CHECK: v_cndmask_b32 +; CHECK-NOT: v_cndmask_b32 +define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %sel = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %sel to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}v_select_i64_split_imm: +; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 +; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 +; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] +; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] +; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} +; CHECK: s_endpgm +define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %cmp = icmp ugt i32 %cond, 5 + %a = load i64, i64 addrspace(1)* %aptr, align 8 + %b = load i64, i64 addrspace(1)* %bptr, align 8 + %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32 + store i64 %sel, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-cnd.ll b/test/CodeGen/AMDGPU/selectcc-cnd.ll new file mode 100644 index 00000000000..94d0ace7569 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-cnd.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE +;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, +;CHECK: 1073741824 +define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { + %1 = load float, float addrspace(1)* %in + %2 = fcmp oeq float %1, 0.0 + %3 = select i1 %2, float 1.0, float 2.0 + store float %3, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll new file mode 100644 index 00000000000..58a4ee7d62b --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE_INT +;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, +;CHECK-NEXT: 2 +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %1 = load i32, i32 addrspace(1)* %in + %2 = icmp eq i32 %1, 0 + %3 = select i1 %2, i32 1, i32 2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll 
b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll new file mode 100644 index 00000000000..e870ee891e6 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Note additional optimizations may cause this SGT to be replaced with a +; CND* instruction. +; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x, +; CHECK-NEXT: -1 +; Test a selectcc with i32 LHS/RHS and float True/False + +define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + %1 = icmp sge i32 %0, 0 + %2 = select i1 %1, float 1.0, float 0.0 + store float %2, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll new file mode 100644 index 00000000000..65be4a626a1 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}test_a: +; EG-NOT: CND +; EG: SET{{[NEQGTL]+}}_DX10 + +define void @test_a(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 0.000000e+00 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + %4 = bitcast i32 %3 to float + %5 = bitcast float %4 to i32 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %IF, label %ENDIF + +IF: + %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %7 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; Same as test_a, but the branch labels are swapped to produce the inverse cc +; for the icmp instruction + +; EG-LABEL: {{^}}test_b: +; EG: SET{{[GTEQN]+}}_DX10 +; EG-NEXT: PRED_ +; EG-NEXT: ALU clause starting +define void @test_b(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 0.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + %4 = bitcast i32 %3 to float + %5 = bitcast float %4 to i32 + %6 = icmp ne i32 %5, 0 + br i1 %6, label %ENDIF, label %IF + +IF: + %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 0, i32 addrspace(1)* %7 + br label %ENDIF + +ENDIF: + store i32 0, i32 addrspace(1)* %out + ret void +} + +; Test a CND*_INT instruction with float true/false values +; EG-LABEL: {{^}}test_c: +; EG: CND{{[GTE]+}}_INT +define void @test_c(float addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + %1 = select i1 %0, float 2.0, float 3.0 + store float %1, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}selectcc_bool: +; SI: v_cmp_ne_i32 +; SI-NEXT: v_cndmask_b32_e64 +; SI-NOT: cmp +; SI-NOT: cndmask +define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = select i1 %icmp0, i32 -1, i32 0 + store i32 %ext, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/selectcc.ll b/test/CodeGen/AMDGPU/selectcc.ll new file mode 100644 index 00000000000..f378e15dd76 --- /dev/null +++ b/test/CodeGen/AMDGPU/selectcc.ll @@ -0,0 +1,20 @@ +; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc 
-verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}selectcc_i64: +; EG: XOR_INT +; EG: XOR_INT +; EG: OR_INT +; EG: CNDE_INT +; EG: CNDE_INT +; SI: v_cmp_eq_i64 +; SI: v_cndmask +; SI: v_cndmask +define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { +entry: + %0 = icmp eq i64 %lhs, %rhs + %1 = select i1 %0, i64 %true, i64 %false + store i64 %1, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll new file mode 100644 index 00000000000..53694dcffa6 --- /dev/null +++ b/test/CodeGen/AMDGPU/set-dx10.ll @@ -0,0 +1,161 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests check that floating point comparisons which are used by select +; to store integer true (-1) and false (0) values are lowered to one of the +; SET*DX10 instructions. + +; CHECK: {{^}}fcmp_une_select_fptosi: +; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_une_select_i32: +; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oeq_select_fptosi: +; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oeq_select_i32: +; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oeq float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ogt_select_fptosi: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ogt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ogt_select_i32: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ogt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + 
+; CHECK: {{^}}fcmp_oge_select_fptosi: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oge float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_oge_select_i32: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp oge float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ole_select_fptosi: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_ole_select_i32: +; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_olt_select_fptosi: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: {{^}}fcmp_olt_select_i32: +; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, +; CHECK-NEXT: LSHR +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc-equivalent.ll b/test/CodeGen/AMDGPU/setcc-equivalent.ll new file mode 100644 index 00000000000..11ea793650c --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc-equivalent.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s + +; EG-LABEL: {{^}}and_setcc_setcc_i32: +; EG: AND_INT +; EG-NEXT: SETE_INT +define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %cmp1 = icmp eq i32 %a, -1 + %cmp2 = icmp eq i32 %b, -1 + %and = and i1 %cmp1, %cmp2 + %ext = sext i1 %and to i32 + store i32 %ext, i32 addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}and_setcc_setcc_v4i32: +; EG: AND_INT +; EG: AND_INT +; EG: SETE_INT +; EG: AND_INT +; EG: SETE_INT +; EG: AND_INT +; EG: SETE_INT +define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %cmp1 = icmp eq <4 x i32> %a, + %cmp2 = icmp eq <4 x i32> %b, + %and = and <4 x i1> %cmp1, %cmp2 + %ext = sext <4 x i1> %and to <4 x i32> + store <4 x i32> %ext, <4 x i32> 
addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll new file mode 100644 index 00000000000..4e6a10d6b78 --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc-opt.ll @@ -0,0 +1,236 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT:buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm + +; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W +; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 +define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm + +; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W +; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 +define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; This really folds away to false +; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc +; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, +; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN-NEXT: s_endpgm +define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; This really folds away to true +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc +; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, +; GCN-NEXT: buffer_store_byte [[TMP]] +; GCN-NEXT: s_endpgm +define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: +; GCN-NOT: v_cmp +; GCN: v_cmp_ne_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define 
void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 0 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: +; GCN-NOT: v_cmp +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp eq i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: +; GCN-NOT: v_cmp +; GCN: v_cmp_eq_i32_e32 vcc, +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 1 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: +; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] +; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}} +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: buffer_store_byte +; GCN: s_endpgm +define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = sext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_zext_k_i8max: +; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} +; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = zext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, 255 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1: +; GCN: buffer_load_sbyte [[B:v[0-9]+]] +; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { + %b = load i8, i8 addrspace(1)* %b.ptr + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: +; GCN: s_load_dword [[B:s[0-9]+]] +; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]] +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FIXME: This ends up doing a buffer_load_ubyte, and compare to +; 255. Seems to be because of ordering problems when not allowing load widths to be reduced.
+; Should do a buffer_load_sbyte and compare with -1 + +; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: +; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} +; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} +; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN-NEXT: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = sext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}cmp_zext_k_neg1: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN: s_endpgm +define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { + %b.ext = zext i8 %b to i32 + %icmp0 = icmp ne i32 %b.ext, -1 + store i1 %icmp0, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp ne i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k: +; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; GCN: buffer_store_byte [[RESULT]] +; GCN-NEXT: s_endpgm +define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %icmp0 = icmp ne i32 %a, %b + %ext = zext i1 %icmp0 to i32 + %icmp1 = icmp eq i32 %ext, 2 + store i1 %icmp1, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll new file mode 100644 index 00000000000..f33a82df5ff --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc.ll @@ -0,0 +1,377 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: {{^}}setcc_v2i32: +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y + +define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { + %result = icmp eq <2 x i32> %a, %b + %sext = sext <2 x i1> %result to <2 x i32> + store <2 x i32> %sext, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}setcc_v4i32: +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = icmp eq <4 x i32> %a, %b + %sext = sext <4 x i1> %result to <4 x i32> + store <4 x i32> %sext, <4 x i32> addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; Float comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}f32_oeq: +; R600: SETE_DX10 +; SI: v_cmp_eq_f32 +define void 
@f32_oeq(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp oeq float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ogt: +; R600: SETGT_DX10 +; SI: v_cmp_gt_f32 +define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ogt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_oge: +; R600: SETGE_DX10 +; SI: v_cmp_ge_f32 +define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp oge float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_olt: +; R600: SETGT_DX10 +; SI: v_cmp_lt_f32 +define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp olt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ole: +; R600: SETGE_DX10 +; SI: v_cmp_le_f32 +define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ole float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_one: +; R600-DAG: SETE_DX10 +; R600-DAG: SETE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_INT + +; SI: v_cmp_lg_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp one float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ord: +; R600-DAG: SETE_DX10 +; R600-DAG: SETE_DX10 +; R600-DAG: AND_INT +; R600-DAG: SETNE_INT +; SI: v_cmp_o_f32 +define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ord float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ueq: +; R600-DAG: SETNE_DX10 +; R600-DAG: SETNE_DX10 +; R600-DAG: OR_INT +; R600-DAG: SETE_DX10 +; R600-DAG: OR_INT +; R600-DAG: SETNE_INT + +; SI: v_cmp_nlg_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ueq float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ugt: +; R600: SETGE +; R600: SETE_DX10 +; SI: v_cmp_nle_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ugt float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_uge: +; R600: SETGT +; R600: SETE_DX10 + +; SI: v_cmp_nlt_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp uge float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ult: +; R600: SETGE +; R600: SETE_DX10 + +; SI: v_cmp_nge_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ult float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_ule: +; R600: SETGT +; R600: SETE_DX10 + +; SI: v_cmp_ngt_f32_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f32_ule(i32 
addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp ule float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_une: +; R600: SETNE_DX10 +; SI: v_cmp_neq_f32 +define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp une float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f32_uno: +; R600: SETNE_DX10 +; R600: SETNE_DX10 +; R600: OR_INT +; R600: SETNE_INT +; SI: v_cmp_u_f32 +define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fcmp uno float %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; 32-bit integer comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}i32_eq: +; R600: SETE_INT +; SI: v_cmp_eq_i32 +define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ne: +; R600: SETNE_INT +; SI: v_cmp_ne_i32 +define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ne i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ugt: +; R600: SETGT_UINT +; SI: v_cmp_gt_u32 +define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ugt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_uge: +; R600: SETGE_UINT +; SI: v_cmp_ge_u32 +define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp uge i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ult: +; R600: SETGT_UINT +; SI: v_cmp_lt_u32 +define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ult i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_ule: +; R600: SETGE_UINT +; SI: v_cmp_le_u32 +define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp ule i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sgt: +; R600: SETGT_INT +; SI: v_cmp_gt_i32 +define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sgt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sge: +; R600: SETGE_INT +; SI: v_cmp_ge_i32 +define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sge i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_slt: +; R600: SETGT_INT +; SI: v_cmp_lt_i32 +define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp slt i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i32_sle: +; R600: SETGE_INT +; SI: v_cmp_le_i32 +define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp sle i32 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FIXME: This does 4 compares +; FUNC-LABEL: {{^}}v3i32_eq: +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: 
v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI: s_endpgm +define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid + %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid + %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a + %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b + %cmp = icmp eq <3 x i32> %a, %b + %ext = sext <3 x i1> %cmp to <3 x i32> + store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out + ret void +} + +; FUNC-LABEL: {{^}}v3i8_eq: +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI-DAG: v_cmp_eq_i32 +; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, +; SI: s_endpgm +define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid + %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid + %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid + %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a + %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b + %cmp = icmp eq <3 x i8> %a, %b + %ext = sext <3 x i1> %cmp to <3 x i8> + store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out + ret void +} diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll new file mode 100644 index 00000000000..231be7aa3da --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc64.ll @@ -0,0 +1,259 @@ +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; XXX: Merge this into setcc, once R600 supports 64-bit operations + +;;;==========================================================================;;; +;; Double comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}f64_oeq: +; SI: v_cmp_eq_f64 +define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp oeq double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ogt: +; SI: v_cmp_gt_f64 +define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ogt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_oge: +; SI: v_cmp_ge_f64 +define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp oge double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_olt: +; SI: v_cmp_lt_f64 +define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp olt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ole: +; SI: v_cmp_le_f64 +define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ole double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* 
%out + ret void +} + +; FUNC-LABEL: {{^}}f64_one: +; SI: v_cmp_lg_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp one double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ord: +; SI: v_cmp_o_f64 +define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ord double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ueq: +; SI: v_cmp_nlg_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ueq double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ugt: + +; SI: v_cmp_nle_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ugt double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_uge: +; SI: v_cmp_nlt_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp uge double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ult: +; SI: v_cmp_nge_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ult double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_ule: +; SI: v_cmp_ngt_f64_e32 vcc +; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc +define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp ule double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_une: +; SI: v_cmp_neq_f64 +define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp une double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}f64_uno: +; SI: v_cmp_u_f64 +define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) { +entry: + %0 = fcmp uno double %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +;;;==========================================================================;;; +;; 64-bit integer comparisons +;;;==========================================================================;;; + +; FUNC-LABEL: {{^}}i64_eq: +; SI: v_cmp_eq_i64 +define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ne: +; SI: v_cmp_ne_i64 +define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ne i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ugt: +; SI: v_cmp_gt_u64 +define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ugt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_uge: +; SI: v_cmp_ge_u64 +define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp uge i64 %a, %b 
+ %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ult: +; SI: v_cmp_lt_u64 +define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ult i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_ule: +; SI: v_cmp_le_u64 +define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp ule i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sgt: +; SI: v_cmp_gt_i64 +define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sgt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sge: +; SI: v_cmp_ge_i64 +define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sge i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_slt: +; SI: v_cmp_lt_i64 +define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp slt i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}i64_sle: +; SI: v_cmp_le_i64 +define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = icmp sle i64 %a, %b + %1 = sext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll new file mode 100644 index 00000000000..9b5d6b5dbd6 --- /dev/null +++ b/test/CodeGen/AMDGPU/seto.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] +; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] +define void @main(float %p) { +main_body: + %c = fcmp oeq float %p, %p + %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/setuo.ll b/test/CodeGen/AMDGPU/setuo.ll new file mode 100644 index 00000000000..76346c4f624 --- /dev/null +++ b/test/CodeGen/AMDGPU/setuo.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}main: +; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] +; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] +define void @main(float %p) { +main_body: + %c = fcmp une float %p, %p + %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/sext-eliminate.ll b/test/CodeGen/AMDGPU/sext-eliminate.ll new file mode 100644 index 00000000000..7dc6eb87f6b --- /dev/null +++ b/test/CodeGen/AMDGPU/sext-eliminate.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add: + +; EG: 
MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: SUB_INT {{[* ]*}}[[RES]] +; EG-NOT: BFE +define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { + %sext = sext i1 %a to i32 + %res = add i32 %b, %sext + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub: + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT {{[* ]*}}[[RES]] +; EG-NOT: BFE +define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { + %sext = sext i1 %a to i32 + %res = sub i32 %b, %sext + store i32 %res, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll new file mode 100644 index 00000000000..5aedda2ce1a --- /dev/null +++ b/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -0,0 +1,611 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + + +; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: +; SI: s_load_dword [[ARG:s[0-9]+]], +; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 +; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]] +; SI: buffer_store_dword [[EXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { + %shl = shl i32 %in, 31 + %sext = ashr i32 %shl, 31 + store i32 %sext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %shl = shl i32 %c, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %shl = shl i32 %c, 16 + %ashr = ashr i32 %shl, 16 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32: +; SI: s_add_i32 [[VAL:s[0-9]+]], +; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] +; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] +; SI: buffer_store_dword [[VEXTRACT]], + +; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] +; EG: ADD_INT +; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal +; EG-NEXT: LSHR * [[ADDR]] +define void 
@sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+ %c = add <1 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <1 x i32> %c, <i32 24>
+ %ashr = ashr <1 x i32> %shl, <i32 24>
+ store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 63
+ %ashr = ashr i64 %shl, 63
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO Check address computation, using | with variables in {{}} does not work,
+;; also the _LO/_HI order might be different
+define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 56
+ %ashr = ashr i64 %shl, 56
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: ASHR [[RES_HI]]
+; EG-NOT: BFE_INT
+; EG: LSHR
+; EG: LSHR
+;; TODO Check address computation, using | with variables in {{}} does not work,
+;; also the _LO/_HI order might be different
+define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 48
+ %ashr = ashr i64 %shl, 48
+ store i64 %ashr, i64 addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE_INT
+
+; EG: ASHR [[RES_HI]]
+
+; EG: LSHR
+; EG: LSHR
+;; TODO Check address computation, using | with variables in {{}} does not work,
+;; also the _LO/_HI order might be different
+define 
void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %c = shl i64 %a, %b + %shl = shl i64 %c, 32 + %ashr = ashr i64 %shl, 32 + store i64 %ashr, i64 addrspace(1)* %out, align 8 + ret void +} + +; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments. +; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64: +; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288 +; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31 +; XSI: buffer_store_dword +; XEG: BFE_INT +; XEG: ASHR +; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind { +; %c = add <1 x i64> %a, %b +; %shl = shl <1 x i64> %c, +; %ashr = ashr <1 x i64> %shl, +; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8 +; ret void +; } + +; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 63 + %ashr = ashr i64 %shl, 63 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 56 + %ashr = ashr i64 %shl, 56 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: +; SI: buffer_load_dwordx2 +; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 +; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] +; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} +define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %a.gep, align 8 + %b = load i64, i64 addrspace(1)* %b.gep, align 8 + + %c = shl i64 %a, %b + %shl = shl i64 %c, 48 + %ashr = ashr i64 %shl, 48 + store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 + ret void +} + +; FUNC-LABEL: 
{{^}}v_sext_in_reg_i32_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x()
+ %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
+ %a = load i64, i64 addrspace(1)* %a.gep, align 8
+ %b = load i64, i64 addrspace(1)* %b.gep, align 8
+
+ %c = shl i64 %a, %b
+ %shl = shl i64 %c, 32
+ %ashr = ashr i64 %shl, 32
+ store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b
+ %x = shl i32 %c, 6
+ %y = ashr i32 %x, 7
+ store i32 %y, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI-NOT: s_lshl
+; SI-NOT: s_ashr
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
+; SI: s_endpgm
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG-NOT: BFE
+; EG: ADD_INT
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHL
+; EG: ASHR [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b
+ %x = shl <2 x i32> %c, <i32 6, i32 6>
+ %y = ashr <2 x i32> %x, <i32 7, i32 7>
+ store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 31, i32 31>
+ %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx4
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+ %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 24, i32 24>
+ %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx4
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
+; EG: BFE_INT [[RES]]
+; EG: BFE_INT [[RES]]
+; EG: LSHR {{\*?}} [[ADDR]]
+define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+ %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i32> %c, <i32 16, i32 16>
+ %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
+ store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}testcase:
+define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}testcase_3:
+define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
+ %and_a_1 = and i8 %a, 1
+ %cmp_eq = icmp eq i8 %and_a_1, 0
+ %cmp_slt = icmp slt i8 %a, 0
+ %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+ %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+ %xor = xor i8 %sel0, %sel1
+ store i8 %xor, i8 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+ %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
+; SI: buffer_load_sbyte
+; SI: v_max_i32
+; SI-NOT: bfe
+; SI: buffer_store_short
+define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
+ %tmp5 = load i8, i8 addrspace(1)* %src, align 1
+ %tmp2 = sext i8 %tmp5 to i32
+ %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
+ %tmp4 = trunc i32 %tmp3 to i8
+ %tmp6 = sext i8 %tmp4 to i16
+ store i16 %tmp6, i16 addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}bfe_0_width:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
+ store i32 %bfe, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_8_bfe_8:
+; SI: v_bfe_i32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}bfe_8_bfe_16:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI: s_endpgm
+define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; This really should be folded into 1
+; FUNC-LABEL: {{^}}bfe_16_bfe_8:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
+ %load = load i32, i32 addrspace(1)* %ptr, align 4
+ %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
+ %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
+ store i32 %bfe1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; Make sure there isn't a redundant BFE
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
+define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ %c = add i32 %a, %b ; add to prevent folding into extload
+ %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
+ %shl = shl i32 %bfe, 24
+ %ashr = ashr i32 %shl, 24
+ 
store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: +define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b ; add to prevent folding into extload + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: +; SI: buffer_load_sbyte +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI: .text +; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} +; SI-NOT: {{[^@]}}bfe +; SI: s_endpgm +define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { + %load = load i8, i8 addrspace(1)* %ptr, align 1 + %sext = sext i8 %load to i32 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone + %shl = shl i32 %bfe, 24 + %ashr = ashr i32 %shl, 24 + store i32 %ashr, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: +; SI-NOT: shr +; SI-NOT: shl +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 +; SI: s_endpgm +define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 31 + %shr = ashr i32 %shl, 31 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: +; SI: buffer_load_dword +; SI-NOT: shl +; SI-NOT: shr +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 +; SI: s_endpgm +define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: +; SI: buffer_load_dword +; SI-NOT: v_lshl +; SI-NOT: v_ashr +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 +; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 +; SI: s_endpgm +define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %x = load i32, i32 addrspace(1)* %in, align 4 + %shl = shl i32 %x, 30 + %shr = ashr i32 %shl, 30 + %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) + store i32 %bfe, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll new file mode 100644 index 00000000000..38289ced632 --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -0,0 +1,105 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; +; +; Most SALU instructions ignore control flow, so we need to make sure +; they don't overwrite values from other blocks. 
+ +; If the branch decision is made based on a value in an SGPR then all +; threads will execute the same code paths, so we don't need to worry +; about instructions in different blocks overwriting each other. +; SI-LABEL: {{^}}sgpr_if_else_salu_br: +; SI: s_add +; SI: s_add + +define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %0 = icmp eq i32 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = add i32 %b, %c + br label %endif + +else: + %2 = add i32 %d, %e + br label %endif + +endif: + %3 = phi i32 [%1, %if], [%2, %else] + %4 = add i32 %3, %a + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; The two S_ADD instructions should write to different registers, since +; different threads will take different control flow paths. + +; SI-LABEL: {{^}}sgpr_if_else_valu_br: +; SI: s_add_i32 [[SGPR:s[0-9]+]] +; SI-NOT: s_add_i32 [[SGPR]] + +define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tid_f = uitofp i32 %tid to float + %tmp1 = fcmp ueq float %tid_f, 0.0 + br i1 %tmp1, label %if, label %else + +if: + %tmp2 = add i32 %b, %c + br label %endif + +else: + %tmp3 = add i32 %d, %e + br label %endif + +endif: + %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else] + store i32 %tmp4, i32 addrspace(1)* %out + ret void +} + +; FIXME: Should write to different SGPR pairs instead of copying to +; VALU for i1 phi. + +; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br: +; SI: buffer_load_dword [[AVAL:v[0-9]+]] +; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] +; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] + +; SI: BB2_1: +; SI: buffer_load_dword [[AVAL:v[0-9]+]] +; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] +; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] + +; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] +; SI: buffer_store_dword [[RESULT]] +define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %tmp1 = icmp eq i32 %tid, 0 + br i1 %tmp1, label %if, label %else + +if: + %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid + %a.val = load i32, i32 addrspace(1)* %gep.if + %cmp.if = icmp eq i32 %a.val, 0 + br label %endif + +else: + %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid + %b.val = load i32, i32 addrspace(1)* %gep.else + %cmp.else = icmp slt i32 %b.val, 0 + br label %endif + +endif: + %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else] + %ext = sext i1 %tmp4 to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { readnone } diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll new file mode 100644 index 00000000000..df67fcca22f --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; Copy VGPR -> SGPR used twice as an instruction operand, which is then +; used in an REG_SEQUENCE that also needs to be handled. 
+ +; SI-LABEL: {{^}}test_dup_operands: +; SI: v_add_i32_e32 +define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %lo = extractelement <2 x i32> %a, i32 0 + %hi = extractelement <2 x i32> %a, i32 1 + %add = add i32 %lo, %lo + %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0 + %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1 + store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8 + ret void +} + diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll new file mode 100644 index 00000000000..b849c4038bc --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -0,0 +1,379 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; This test checks that no VGPR to SGPR copies are created by the register +; allocator. +; CHECK-LABEL: {{^}}phi1: +; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 +; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] + +define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) + %25 = fptosi float %23 to i32 + %26 = icmp ne i32 %25, 0 + br i1 %26, label %ENDIF, label %ELSE + +ELSE: ; preds = %main_body + %27 = fsub float -0.000000e+00, %22 + br label %ENDIF + +ENDIF: ; preds = %main_body, %ELSE + %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ] + %28 = fadd float %temp.0, %24 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00) + ret void +} + +; Make sure this program doesn't crash +; CHECK-LABEL: {{^}}phi2: +define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) + %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36) + %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40) + %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48) + %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52) + %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56) + %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64) + %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68) + %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72) + %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76) + %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80) + %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84) + %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88) + %36 = call float 
@llvm.SI.load.const(<16 x i8> %21, i32 92) + %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1 + %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 + %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1 + %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) + %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) + %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5) + %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5) + %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5) + %46 = bitcast float %41 to i32 + %47 = bitcast float %42 to i32 + %48 = insertelement <2 x i32> undef, i32 %46, i32 0 + %49 = insertelement <2 x i32> %48, i32 %47, i32 1 + %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2) + %51 = extractelement <4 x float> %50, i32 2 + %52 = call float @fabs(float %51) + %53 = fmul float %43, %43 + %54 = fmul float %44, %44 + %55 = fadd float %54, %53 + %56 = fmul float %45, %45 + %57 = fadd float %55, %56 + %58 = call float @llvm.AMDGPU.rsq.f32(float %57) + %59 = fmul float %43, %58 + %60 = fmul float %44, %58 + %61 = fmul float %45, %58 + %62 = fmul float %59, %23 + %63 = fmul float %60, %24 + %64 = fadd float %63, %62 + %65 = fmul float %61, %25 + %66 = fadd float %64, %65 + %67 = fsub float -0.000000e+00, %26 + %68 = fmul float %66, %52 + %69 = fadd float %68, %67 + %70 = fmul float %27, %69 + %71 = fmul float %28, %69 + %72 = call float @fabs(float %70) + %73 = fcmp olt float 0x3EE4F8B580000000, %72 + %74 = sext i1 %73 to i32 + %75 = bitcast i32 %74 to float + %76 = bitcast float %75 to i32 + %77 = icmp ne i32 %76, 0 + br i1 %77, label %IF, label %ENDIF + +IF: ; preds = %main_body + %78 = fsub float -0.000000e+00, %70 + %79 = call float @llvm.AMDIL.exp.(float %78) + %80 = fsub float -0.000000e+00, %79 + %81 = fadd float 1.000000e+00, %80 + %82 = fdiv float 1.000000e+00, %70 + %83 = fmul float %81, %82 + %84 = fmul float %32, %83 + br label %ENDIF + +ENDIF: ; preds = %main_body, %IF + %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ] + %85 = call float @fabs(float %71) + %86 = fcmp olt float 0x3EE4F8B580000000, %85 + %87 = sext i1 %86 to i32 + %88 = bitcast i32 %87 to float + %89 = bitcast float %88 to i32 + %90 = icmp ne i32 %89, 0 + br i1 %90, label %IF25, label %ENDIF24 + +IF25: ; preds = %ENDIF + %91 = fsub float -0.000000e+00, %71 + %92 = call float @llvm.AMDIL.exp.(float %91) + %93 = fsub float -0.000000e+00, %92 + %94 = fadd float 1.000000e+00, %93 + %95 = fdiv float 1.000000e+00, %71 + %96 = fmul float %94, %95 + %97 = fmul float %36, %96 + br label %ENDIF24 + +ENDIF24: ; preds = %ENDIF, %IF25 + %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ] + %98 = fmul float %29, %temp4.0 + %99 = fmul float %30, %temp4.0 + %100 = fmul float %31, %temp4.0 + %101 = fmul float %33, %temp8.0 + %102 = fadd float %101, %98 + %103 = fmul float %34, %temp8.0 + %104 = fadd float %103, %99 + %105 = fmul float %35, %temp8.0 + %106 = fadd float %105, %100 + %107 = call float @llvm.pow.f32(float %52, float %22) + %108 = fsub float -0.000000e+00, %102 + %109 = fmul float %108, %107 + %110 = fsub float -0.000000e+00, %104 + %111 = fmul float %110, %107 + %112 = fsub float -0.000000e+00, %106 + %113 = fmul float %112, %107 + %114 = call i32 @llvm.SI.packf16(float %109, float %111) + %115 = bitcast i32 %114 to float + %116 = call i32 @llvm.SI.packf16(float 
%113, float 1.000000e+00)
+ %117 = bitcast i32 %116 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+ ret void
+}
+
+; We just want to make sure the program doesn't crash
+; CHECK-LABEL: {{^}}loop:
+
+define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
+ %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12)
+ %26 = fptosi float %25 to i32
+ %27 = bitcast i32 %26 to float
+ %28 = bitcast float %27 to i32
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %main_body
+ %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ]
+ %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ]
+ %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ]
+ %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ]
+ %29 = bitcast float %temp8.0 to i32
+ %30 = icmp sge i32 %29, %28
+ %31 = sext i1 %30 to i32
+ %32 = bitcast i32 %31 to float
+ %33 = bitcast float %32 to i32
+ %34 = icmp ne i32 %33, 0
+ br i1 %34, label %IF, label %ENDIF
+
+IF: ; preds = %LOOP
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+ ret void
+
+ENDIF: ; preds = %LOOP
+ %35 = bitcast float %temp8.0 to i32
+ %36 = add i32 %35, 1
+ %37 = bitcast i32 %36 to float
+ br label %LOOP
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readonly }
+attributes #3 = { readnone }
+attributes #4 = { nounwind readonly }
+
+!0 = !{!"const", null}
+!1 = !{!0, !0, i64 0, i32 1}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq.f32(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #3
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+; This checks for a bug in the FixSGPRCopies pass where VReg96
+; registers were being identified as an SGPR regclass which was causing
+; an assertion failure. 
+ +; CHECK-LABEL: {{^}}sample_v3: +; CHECK: image_sample +; CHECK: image_sample +; CHECK: exp +; CHECK: s_endpgm +define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { + +entry: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16) + %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2 + %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2 + %28 = fcmp oeq float %23, 0.0 + br i1 %28, label %if, label %else + +if: + %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) + %val.if.0 = extractelement <4 x float> %val.if, i32 0 + %val.if.1 = extractelement <4 x float> %val.if, i32 1 + %val.if.2 = extractelement <4 x float> %val.if, i32 2 + br label %endif + +else: + %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) + %val.else.0 = extractelement <4 x float> %val.else, i32 0 + %val.else.1 = extractelement <4 x float> %val.else, i32 1 + %val.else.2 = extractelement <4 x float> %val.else, i32 2 + br label %endif + +endif: + %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else] + %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else] + %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else] + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0) + ret void +} + +!2 = !{!"const", null, i32 1} + +; CHECK-LABEL: {{^}}copy1: +; CHECK: buffer_load_dword +; CHECK: v_add +; CHECK: s_endpgm +define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { +entry: + %0 = load float, float addrspace(1)* %in0 + %1 = fcmp oeq float %0, 0.0 + br i1 %1, label %if0, label %endif + +if0: + %2 = bitcast float %0 to i32 + %3 = fcmp olt float %0, 0.0 + br i1 %3, label %if1, label %endif + +if1: + %4 = add i32 %2, 1 + br label %endif + +endif: + %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ] + %6 = bitcast i32 %5 to float + store float %6, float addrspace(1)* %out + ret void +} + +; This test is just checking that we don't crash / assertion fail. 
+; CHECK-LABEL: {{^}}copy2: +; CHECK: s_endpgm + +define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +entry: + br label %LOOP68 + +LOOP68: + %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ] + %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ] + %g = icmp eq i32 0, %t + %l = bitcast float %temp4.7 to i32 + br i1 %g, label %IF70, label %ENDIF69 + +IF70: + %q = icmp ne i32 %l, 13 + %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + ret void + +ENDIF69: + %u = add i32 %l, %t + %v = bitcast i32 %u to float + %x = add i32 %t, -1 + br label %LOOP68 +} + +attributes #0 = { "ShaderType"="0" } + +; This test checks that image_sample resource descriptors aren't loaded into +; vgprs. The verifier will fail if this happens. +; CHECK-LABEL:{{^}}sample_rsrc: +; CHECK: image_sample +; CHECK: image_sample +; CHECK: s_endpgm +define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +bb: + %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 + %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) + %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 + %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0 + %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 + %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0 + %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) + %tmp31 = bitcast float %tmp23 to i32 + %tmp36 = icmp ne i32 %tmp31, 0 + br i1 %tmp36, label %bb38, label %bb80 + +bb38: ; preds = %bb + %tmp52 = bitcast float %tmp29 to i32 + %tmp53 = bitcast float %tmp30 to i32 + %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 + %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 + %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> + %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> + %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) + br label %bb71 + +bb80: ; preds = %bb + %tmp81 = bitcast float %tmp29 to i32 + %tmp82 = bitcast float %tmp30 to i32 + %tmp82.2 = add i32 %tmp82, 1 + %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 + %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 + %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> + %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> + %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) + br label %bb71 + +bb71: ; preds = %bb80, %bb38 + %tmp72 = phi <4 x float> [ %tmp58, 
%bb38 ], [ %tmp87, %bb80 ] + %tmp88 = extractelement <4 x float> %tmp72, i32 0 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) + ret void +} + +attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shared-op-cycle.ll b/test/CodeGen/AMDGPU/shared-op-cycle.ll new file mode 100644 index 00000000000..f52a9baf4d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/shared-op-cycle.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: {{^}}main: +; CHECK: MULADD_IEEE * +; CHECK-NOT: MULADD_IEEE * + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { + %w0 = extractelement <4 x float> %reg0, i32 3 + %w1 = extractelement <4 x float> %reg1, i32 3 + %w2 = extractelement <4 x float> %reg2, i32 3 + %sq0 = fmul float %w0, %w0 + %r0 = fadd float %sq0, 2.0 + %sq1 = fmul float %w1, %w1 + %r1 = fadd float %sq1, 2.0 + %sq2 = fmul float %w2, %w2 + %r2 = fadd float %sq2, 2.0 + %v0 = insertelement <4 x float> undef, float %r0, i32 0 + %v1 = insertelement <4 x float> %v0, float %r1, i32 1 + %v2 = insertelement <4 x float> %v1, float %r2, i32 2 + %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2) + %vecres = insertelement <4 x float> undef, float %res, i32 0 + call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2) + ret void +} + +; Function Attrs: readnone +declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { readnone } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll new file mode 100644 index 00000000000..53b63dc4b8a --- /dev/null +++ b/test/CodeGen/AMDGPU/shl.ll @@ -0,0 +1,180 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s + +;EG: {{^}}shl_v2i32: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v2i32: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v2i32: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = shl <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i32: +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}shl_v4i32: +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI: {{^}}shl_v4i32: +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = shl <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_i64: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 + +;SI: {{^}}shl_i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1) * %in + %b = load i64, i64 addrspace(1) * %b_ptr + %result = shl i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v2i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI: {{^}}shl_v2i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_v2i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1) * %in + %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr + %result = shl <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +;EG: {{^}}shl_v4i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHC]] +;EG-DAG: LSHR {{\*? *}}[[COMPSHD]] +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: LSHR {{.*}}, 1 +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHC]] +;EG-DAG: LSHL {{.*}}, [[SHD]] +;EG-DAG: LSHL {{.*}}, [[SHA]] +;EG-DAG: LSHL {{.*}}, [[SHB]] +;EG-DAG: LSHL {{.*}}, [[SHC]] +;EG-DAG: LSHL {{.*}}, [[SHD]] +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: LSHL +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHD]], literal +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT {{.*}}, 0.0 +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI: {{^}}shl_v4i64: +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI: {{^}}shl_v4i64: +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1) * %in + %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr + %result = shl <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll new file mode 100644 index 00000000000..b1485bfaaeb --- /dev/null +++ b/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -0,0 +1,90 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; Test with inline immediate + +; FUNC-LABEL: {{^}}shl_2_add_9_i32: +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %add = add i32 %val, 9 + %result = shl i32 %add, 2 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: +; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} +; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} +; SI-DAG: buffer_store_dword [[ADDREG]] +; SI-DAG: buffer_store_dword [[SHLREG]] +; SI: s_endpgm +define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %add = add i32 %val, 9 + %result = shl i32 %add, 2 + store i32 %result, i32 addrspace(1)* %out0, align 4 + store i32 %add, i32 addrspace(1)* %out1, align 4 + ret void +} + +; Test with add literal constant + +; FUNC-LABEL: {{^}}shl_2_add_999_i32: +; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} +; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x + %val = load i32, i32 addrspace(1)* %ptr, align 4 + %shl = add i32 %val, 999 + %result = shl i32 %shl, 2 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: 
{{^}}test_add_shl_add_constant: +; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: buffer_store_dword [[VRESULT]] +define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %shl, %y + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: +; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 +; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] +; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: buffer_store_dword [[VRESULT]] + +define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %y, %shl + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll new file mode 100644 index 00000000000..6671e909cd1 --- /dev/null +++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -0,0 +1,284 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s + +; Test that doing a shift of a pointer with a constant add will be +; folded into the constant offset addressing mode even if the add has +; multiple uses. This is relevant to accessing 2 separate, adjacent +; LDS globals. + + +declare i32 @llvm.r600.read.tidig.x() #1 + +@lds0 = addrspace(3) global [512 x float] undef, align 4 +@lds1 = addrspace(3) global [512 x float] undef, align 4 + + +; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8 + +; SI-LABEL: {{^}}load_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 +; SI: s_endpgm +define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + store float %val0, float addrspace(1)* %out + ret void +} + +; Make sure once the first use is folded into the addressing mode, the +; remaining add use goes through the normal shl + add constant fold. 
+ +; SI-LABEL: {{^}}load_shl_base_lds_1: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 +; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} +; SI-DAG: buffer_store_dword [[RESULT]] +; SI-DAG: buffer_store_dword [[ADDUSE]] +; SI: s_endpgm +define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %shl_add_use = shl i32 %idx.0, 2 + store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4 + store float %val0, float addrspace(1)* %out + ret void +} + +@maxlds = addrspace(3) global [65536 x i8] undef, align 4 + +; SI-LABEL: {{^}}load_shl_base_lds_max_offset +; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 +; SI: s_endpgm +define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 65535 + %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 + %val0 = load i8, i8 addrspace(3)* %arrayidx0 + store i32 %idx.0, i32 addrspace(1)* %add_use + store i8 %val0, i8 addrspace(1)* %out + ret void +} + +; The two globals are placed adjacent in memory, so the same base +; pointer can be used with an offset into the second one. + +; SI-LABEL: {{^}}load_shl_base_lds_2: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: s_mov_b32 m0, -1 +; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 +; SI: s_endpgm +define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 64 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = load float, float addrspace(3)* %arrayidx0, align 4 + %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %val1 = load float, float addrspace(3)* %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + store float %sum, float addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}store_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 + store float 1.0, float addrspace(3)* %arrayidx0, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + + +; -------------------------------------------------------------------------------- +; Atomics. 
+ +@lds2 = addrspace(3) global [512 x i32] undef, align 4 + +; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 +; %idx.0 = add nsw i32 %tid.x, 2 +; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 +; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4 +; store i32 %val, i32 addrspace(1)* %out, align 4 +; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 +; ret void +; } + + +; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic + %result = extractvalue { i32, i1 } %pair, 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_add_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_and_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 
addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_or_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 +; %idx.0 = add nsw i32 %tid.x, 2 +; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 +; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst +; store i32 %val, i32 addrspace(1)* %out, align 4 +; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 +; ret void +; } + +; SI-LABEL: {{^}}atomic_min_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_max_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: 
{{^}}atomic_umin_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0: +; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 +; SI: s_endpgm +define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { + %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 + %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll new file mode 100644 index 00000000000..69d719385ac --- /dev/null +++ b/test/CodeGen/AMDGPU/si-annotate-cf-assertion.ll @@ -0,0 +1,25 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s + + +define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +; CHECK-LABEL: {{^}}test: + +entry: + switch i32 %x, label %sw.default [ + i32 0, label %sw.bb + i32 60, label %sw.bb + ] + +sw.bb: + unreachable + +sw.default: + unreachable + +sw.epilog: + ret void +} + diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll new file mode 100644 index 00000000000..bbcb861f37d --- /dev/null +++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -0,0 +1,63 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}break_inserted_outside_of_loop: + +; SI: [[LOOP_LABEL:[A-Z0-9]+]]: +; Lowered break instruction: +; SI: s_or_b64 +; Lowered Loop instruction: +; SI: s_andn2_b64 +; s_cbranch_execnz [[LOOP_LABEL]] +; SI: s_endpgm +define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) { +main_body: + %0 = and i32 %a, %b + %1 = trunc i32 %0 to i1 + br label %ENDIF + +ENDLOOP: + store i32 0, i32 addrspace(1)* %out + ret void + +ENDIF: + br i1 %1, label %ENDLOOP, label %ENDIF +} + + +; FUNC-LABEL: {{^}}phi_cond_outside_loop: +; FIXME: This could be folded into the s_or_b64 instruction +; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0 +; SI: [[LOOP_LABEL:[A-Z0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} + +; SI_IF_BREAK instruction: +; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]] + +; SI_LOOP instruction: +; SI: s_andn2_b64 exec, exec, [[BREAK]] +; SI: s_cbranch_execnz [[LOOP_LABEL]] +; SI: s_endpgm
+ +define void @phi_cond_outside_loop(i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a , 0 + br i1 %0, label %if, label %else + +if: + br label %endif + +else: + %1 = icmp eq i32 %b, 0 + br label %endif + +endif: + %2 = phi i1 [0, %if], [%1, %else] + br label %loop + +loop: + br i1 %2, label %exit, label %loop + +exit: + ret void +} diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll new file mode 100644 index 00000000000..944499a1146 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -0,0 +1,52 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; This shader has the potential to generate illegal VGPR to SGPR copies if +; the wrong register class is used for the REG_SEQUENCE instructions. + +; CHECK: {{^}}main: +; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}} + +define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 + %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1 + %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 + %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1 + %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) + %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) + %29 = bitcast float %22 to i32 + %30 = bitcast float %27 to i32 + %31 = bitcast float %28 to i32 + %32 = insertelement <4 x i32> undef, i32 %29, i32 0 + %33 = insertelement <4 x i32> %32, i32 %30, i32 1 + %34 = insertelement <4 x i32> %33, i32 %31, i32 2 + %35 = insertelement <4 x i32> %34, i32 undef, i32 3 + %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2) + %37 = extractelement <4 x float> %36, i32 0 + %38 = extractelement <4 x float> %36, i32 1 + %39 = extractelement <4 x float> %36, i32 2 + %40 = extractelement <4 x float> %36, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } + +!0 = !{!"const", null} +!1 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll new file mode 100644 index 00000000000..84652701f77 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -0,0 +1,1568 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s + +; These tests check that the compiler won't crash when it needs to spill +; SGPRs.
+ +; CHECK-LABEL: {{^}}main: +; CHECK: s_wqm +; Writing to M0 from an SMRD instruction will hang the GPU. +; CHECK-NOT: s_buffer_load_dword m0 +; CHECK: s_endpgm +@ddxy_lds = external addrspace(3) global [64 x i32] + +define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96) + %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100) + %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104) + %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112) + %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116) + %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120) + %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) + %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) + %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140) + %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) + %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) + %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) + %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) + %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) + %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) + %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) + %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) + %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) + %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) + %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) + %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224) + %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) + %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) + %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) + %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) + %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) + %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) + %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) + %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) + %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) + %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296) + %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304) + %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308) + %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312) + %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368) + %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372) + %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376) + %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384) + %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0 + %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0 + %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 + %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0 + %67 = getelementptr [32 x 
<16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 + %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0 + %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 + %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0 + %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 + %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0 + %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 + %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0 + %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 + %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0 + %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 + %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0 + %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 + %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0 + %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 + %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0 + %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 + %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0 + %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 + %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0 + %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 + %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0 + %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 + %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0 + %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 + %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0 + %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) + %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) + %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) + %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) + %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) + %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) + %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) + %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) + %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) + %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) + %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) + %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) + %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) + %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) + %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) + %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) + %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) + %110 = call i32 @llvm.SI.tid() + %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110 + %112 = bitcast float %93 to i32 + store i32 %112, i32 addrspace(3)* %111 + %113 = bitcast float %94 to i32 + store i32 %113, i32 addrspace(3)* %111 + %114 = call i32 @llvm.SI.tid() + %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* 
@ddxy_lds, i32 0, i32 %114 + %116 = and i32 %114, -4 + %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116 + %118 = add i32 %116, 1 + %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118 + %120 = bitcast float %93 to i32 + store i32 %120, i32 addrspace(3)* %115 + %121 = load i32, i32 addrspace(3)* %117 + %122 = bitcast i32 %121 to float + %123 = load i32, i32 addrspace(3)* %119 + %124 = bitcast i32 %123 to float + %125 = fsub float %124, %122 + %126 = bitcast float %94 to i32 + store i32 %126, i32 addrspace(3)* %115 + %127 = load i32, i32 addrspace(3)* %117 + %128 = bitcast i32 %127 to float + %129 = load i32, i32 addrspace(3)* %119 + %130 = bitcast i32 %129 to float + %131 = fsub float %130, %128 + %132 = insertelement <4 x float> undef, float %125, i32 0 + %133 = insertelement <4 x float> %132, float %131, i32 1 + %134 = insertelement <4 x float> %133, float %131, i32 2 + %135 = insertelement <4 x float> %134, float %131, i32 3 + %136 = extractelement <4 x float> %135, i32 0 + %137 = extractelement <4 x float> %135, i32 1 + %138 = fmul float %60, %93 + %139 = fmul float %60, %94 + %140 = fmul float %60, %94 + %141 = fmul float %60, %94 + %142 = call i32 @llvm.SI.tid() + %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142 + %144 = bitcast float %138 to i32 + store i32 %144, i32 addrspace(3)* %143 + %145 = bitcast float %139 to i32 + store i32 %145, i32 addrspace(3)* %143 + %146 = bitcast float %140 to i32 + store i32 %146, i32 addrspace(3)* %143 + %147 = bitcast float %141 to i32 + store i32 %147, i32 addrspace(3)* %143 + %148 = call i32 @llvm.SI.tid() + %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148 + %150 = and i32 %148, -4 + %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150 + %152 = add i32 %150, 2 + %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152 + %154 = bitcast float %138 to i32 + store i32 %154, i32 addrspace(3)* %149 + %155 = load i32, i32 addrspace(3)* %151 + %156 = bitcast i32 %155 to float + %157 = load i32, i32 addrspace(3)* %153 + %158 = bitcast i32 %157 to float + %159 = fsub float %158, %156 + %160 = bitcast float %139 to i32 + store i32 %160, i32 addrspace(3)* %149 + %161 = load i32, i32 addrspace(3)* %151 + %162 = bitcast i32 %161 to float + %163 = load i32, i32 addrspace(3)* %153 + %164 = bitcast i32 %163 to float + %165 = fsub float %164, %162 + %166 = bitcast float %140 to i32 + store i32 %166, i32 addrspace(3)* %149 + %167 = load i32, i32 addrspace(3)* %151 + %168 = bitcast i32 %167 to float + %169 = load i32, i32 addrspace(3)* %153 + %170 = bitcast i32 %169 to float + %171 = fsub float %170, %168 + %172 = bitcast float %141 to i32 + store i32 %172, i32 addrspace(3)* %149 + %173 = load i32, i32 addrspace(3)* %151 + %174 = bitcast i32 %173 to float + %175 = load i32, i32 addrspace(3)* %153 + %176 = bitcast i32 %175 to float + %177 = fsub float %176, %174 + %178 = insertelement <4 x float> undef, float %159, i32 0 + %179 = insertelement <4 x float> %178, float %165, i32 1 + %180 = insertelement <4 x float> %179, float %171, i32 2 + %181 = insertelement <4 x float> %180, float %177, i32 3 + %182 = extractelement <4 x float> %181, i32 0 + %183 = extractelement <4 x float> %181, i32 1 + %184 = fdiv float 1.000000e+00, %97 + %185 = fmul float %33, %184 + %186 = fcmp uge float 1.000000e+00, %185 + %187 = select i1 %186, float %185, float 1.000000e+00 + %188 = fmul float 
%187, %30 + %189 = call float @ceil(float %188) + %190 = fcmp uge float 3.000000e+00, %189 + %191 = select i1 %190, float 3.000000e+00, float %189 + %192 = fdiv float 1.000000e+00, %191 + %193 = fdiv float 1.000000e+00, %30 + %194 = fmul float %191, %193 + %195 = fmul float %31, %194 + %196 = fmul float %95, %95 + %197 = fmul float %96, %96 + %198 = fadd float %197, %196 + %199 = fmul float %97, %97 + %200 = fadd float %198, %199 + %201 = call float @llvm.AMDGPU.rsq.f32(float %200) + %202 = fmul float %95, %201 + %203 = fmul float %96, %201 + %204 = fmul float %202, %29 + %205 = fmul float %203, %29 + %206 = fmul float %204, -1.000000e+00 + %207 = fmul float %205, 1.000000e+00 + %208 = fmul float %206, %32 + %209 = fmul float %207, %32 + %210 = fsub float -0.000000e+00, %208 + %211 = fadd float %93, %210 + %212 = fsub float -0.000000e+00, %209 + %213 = fadd float %94, %212 + %214 = fmul float %206, %192 + %215 = fmul float %207, %192 + %216 = fmul float -1.000000e+00, %192 + %217 = bitcast float %136 to i32 + %218 = bitcast float %182 to i32 + %219 = bitcast float %137 to i32 + %220 = bitcast float %183 to i32 + %221 = insertelement <8 x i32> undef, i32 %217, i32 0 + %222 = insertelement <8 x i32> %221, i32 %218, i32 1 + %223 = insertelement <8 x i32> %222, i32 %219, i32 2 + %224 = insertelement <8 x i32> %223, i32 %220, i32 3 + br label %LOOP + +LOOP: ; preds = %ENDIF, %main_body + %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ] + %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ] + %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ] + %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ] + %225 = fcmp oge float %temp24.0, %191 + %226 = sext i1 %225 to i32 + %227 = bitcast i32 %226 to float + %228 = bitcast float %227 to i32 + %229 = icmp ne i32 %228, 0 + br i1 %229, label %IF, label %ENDIF + +IF: ; preds = %LOOP + %230 = bitcast float %136 to i32 + %231 = bitcast float %182 to i32 + %232 = bitcast float %137 to i32 + %233 = bitcast float %183 to i32 + %234 = insertelement <8 x i32> undef, i32 %230, i32 0 + %235 = insertelement <8 x i32> %234, i32 %231, i32 1 + %236 = insertelement <8 x i32> %235, i32 %232, i32 2 + %237 = insertelement <8 x i32> %236, i32 %233, i32 3 + br label %LOOP65 + +ENDIF: ; preds = %LOOP + %238 = bitcast float %temp28.0 to i32 + %239 = bitcast float %temp29.0 to i32 + %240 = insertelement <8 x i32> %224, i32 %238, i32 4 + %241 = insertelement <8 x i32> %240, i32 %239, i32 5 + %242 = insertelement <8 x i32> %241, i32 undef, i32 6 + %243 = insertelement <8 x i32> %242, i32 undef, i32 7 + %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2) + %245 = extractelement <4 x float> %244, i32 3 + %246 = fcmp oge float %temp30.0, %245 + %247 = sext i1 %246 to i32 + %248 = bitcast i32 %247 to float + %249 = bitcast float %248 to i32 + %250 = and i32 %249, 1065353216 + %251 = bitcast i32 %250 to float + %252 = fmul float %214, %251 + %253 = fadd float %252, %temp28.0 + %254 = fmul float %215, %251 + %255 = fadd float %254, %temp29.0 + %256 = fmul float %216, %251 + %257 = fadd float %256, %temp30.0 + %258 = fadd float %temp24.0, 1.000000e+00 + br label %LOOP + +LOOP65: ; preds = %ENDIF66, %IF + %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ] + %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ] + %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ] + %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ] + %temp32.0 = phi float [ 
1.000000e+00, %IF ], [ %611, %ENDIF66 ] + %259 = fcmp oge float %temp24.1, %195 + %260 = sext i1 %259 to i32 + %261 = bitcast i32 %260 to float + %262 = bitcast float %261 to i32 + %263 = icmp ne i32 %262, 0 + br i1 %263, label %IF67, label %ENDIF66 + +IF67: ; preds = %LOOP65 + %264 = bitcast float %136 to i32 + %265 = bitcast float %182 to i32 + %266 = bitcast float %137 to i32 + %267 = bitcast float %183 to i32 + %268 = bitcast float %temp28.1 to i32 + %269 = bitcast float %temp29.1 to i32 + %270 = insertelement <8 x i32> undef, i32 %264, i32 0 + %271 = insertelement <8 x i32> %270, i32 %265, i32 1 + %272 = insertelement <8 x i32> %271, i32 %266, i32 2 + %273 = insertelement <8 x i32> %272, i32 %267, i32 3 + %274 = insertelement <8 x i32> %273, i32 %268, i32 4 + %275 = insertelement <8 x i32> %274, i32 %269, i32 5 + %276 = insertelement <8 x i32> %275, i32 undef, i32 6 + %277 = insertelement <8 x i32> %276, i32 undef, i32 7 + %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2) + %279 = extractelement <4 x float> %278, i32 0 + %280 = extractelement <4 x float> %278, i32 1 + %281 = extractelement <4 x float> %278, i32 2 + %282 = extractelement <4 x float> %278, i32 3 + %283 = fmul float %282, %47 + %284 = bitcast float %136 to i32 + %285 = bitcast float %182 to i32 + %286 = bitcast float %137 to i32 + %287 = bitcast float %183 to i32 + %288 = bitcast float %temp28.1 to i32 + %289 = bitcast float %temp29.1 to i32 + %290 = insertelement <8 x i32> undef, i32 %284, i32 0 + %291 = insertelement <8 x i32> %290, i32 %285, i32 1 + %292 = insertelement <8 x i32> %291, i32 %286, i32 2 + %293 = insertelement <8 x i32> %292, i32 %287, i32 3 + %294 = insertelement <8 x i32> %293, i32 %288, i32 4 + %295 = insertelement <8 x i32> %294, i32 %289, i32 5 + %296 = insertelement <8 x i32> %295, i32 undef, i32 6 + %297 = insertelement <8 x i32> %296, i32 undef, i32 7 + %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2) + %299 = extractelement <4 x float> %298, i32 0 + %300 = extractelement <4 x float> %298, i32 1 + %301 = extractelement <4 x float> %298, i32 2 + %302 = bitcast float %136 to i32 + %303 = bitcast float %182 to i32 + %304 = bitcast float %137 to i32 + %305 = bitcast float %183 to i32 + %306 = bitcast float %temp28.1 to i32 + %307 = bitcast float %temp29.1 to i32 + %308 = insertelement <8 x i32> undef, i32 %302, i32 0 + %309 = insertelement <8 x i32> %308, i32 %303, i32 1 + %310 = insertelement <8 x i32> %309, i32 %304, i32 2 + %311 = insertelement <8 x i32> %310, i32 %305, i32 3 + %312 = insertelement <8 x i32> %311, i32 %306, i32 4 + %313 = insertelement <8 x i32> %312, i32 %307, i32 5 + %314 = insertelement <8 x i32> %313, i32 undef, i32 6 + %315 = insertelement <8 x i32> %314, i32 undef, i32 7 + %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2) + %317 = extractelement <4 x float> %316, i32 0 + %318 = extractelement <4 x float> %316, i32 1 + %319 = extractelement <4 x float> %316, i32 2 + %320 = fmul float %317, %23 + %321 = fmul float %318, %24 + %322 = fmul float %319, %25 + %323 = fmul float %299, %26 + %324 = fadd float %323, %320 + %325 = fmul float %300, %27 + %326 = fadd float %325, %321 + %327 = fmul float %301, %28 + %328 = fadd float %327, %322 + %329 = fadd float %279, %324 + %330 = fadd float %280, %326 + %331 = fadd float %281, %328 + %332 = bitcast float %136 to i32 + %333 = bitcast float %182 to i32 + %334 = bitcast float %137 to i32 + 
%335 = bitcast float %183 to i32 + %336 = bitcast float %temp28.1 to i32 + %337 = bitcast float %temp29.1 to i32 + %338 = insertelement <8 x i32> undef, i32 %332, i32 0 + %339 = insertelement <8 x i32> %338, i32 %333, i32 1 + %340 = insertelement <8 x i32> %339, i32 %334, i32 2 + %341 = insertelement <8 x i32> %340, i32 %335, i32 3 + %342 = insertelement <8 x i32> %341, i32 %336, i32 4 + %343 = insertelement <8 x i32> %342, i32 %337, i32 5 + %344 = insertelement <8 x i32> %343, i32 undef, i32 6 + %345 = insertelement <8 x i32> %344, i32 undef, i32 7 + %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2) + %347 = extractelement <4 x float> %346, i32 0 + %348 = extractelement <4 x float> %346, i32 1 + %349 = extractelement <4 x float> %346, i32 2 + %350 = fadd float %347, -5.000000e-01 + %351 = fadd float %348, -5.000000e-01 + %352 = fadd float %349, -5.000000e-01 + %353 = fmul float %350, %350 + %354 = fmul float %351, %351 + %355 = fadd float %354, %353 + %356 = fmul float %352, %352 + %357 = fadd float %355, %356 + %358 = call float @llvm.AMDGPU.rsq.f32(float %357) + %359 = fmul float %350, %358 + %360 = fmul float %351, %358 + %361 = fmul float %352, %358 + %362 = bitcast float %136 to i32 + %363 = bitcast float %182 to i32 + %364 = bitcast float %137 to i32 + %365 = bitcast float %183 to i32 + %366 = bitcast float %temp28.1 to i32 + %367 = bitcast float %temp29.1 to i32 + %368 = insertelement <8 x i32> undef, i32 %362, i32 0 + %369 = insertelement <8 x i32> %368, i32 %363, i32 1 + %370 = insertelement <8 x i32> %369, i32 %364, i32 2 + %371 = insertelement <8 x i32> %370, i32 %365, i32 3 + %372 = insertelement <8 x i32> %371, i32 %366, i32 4 + %373 = insertelement <8 x i32> %372, i32 %367, i32 5 + %374 = insertelement <8 x i32> %373, i32 undef, i32 6 + %375 = insertelement <8 x i32> %374, i32 undef, i32 7 + %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2) + %377 = extractelement <4 x float> %376, i32 0 + %378 = extractelement <4 x float> %376, i32 1 + %379 = extractelement <4 x float> %376, i32 2 + %380 = extractelement <4 x float> %376, i32 3 + %381 = fsub float -0.000000e+00, %95 + %382 = fsub float -0.000000e+00, %96 + %383 = fsub float -0.000000e+00, %97 + %384 = fmul float %359, %381 + %385 = fmul float %360, %382 + %386 = fadd float %385, %384 + %387 = fmul float %361, %383 + %388 = fadd float %386, %387 + %389 = fmul float %388, %359 + %390 = fmul float %388, %360 + %391 = fmul float %388, %361 + %392 = fmul float 2.000000e+00, %389 + %393 = fmul float 2.000000e+00, %390 + %394 = fmul float 2.000000e+00, %391 + %395 = fsub float -0.000000e+00, %392 + %396 = fadd float %381, %395 + %397 = fsub float -0.000000e+00, %393 + %398 = fadd float %382, %397 + %399 = fsub float -0.000000e+00, %394 + %400 = fadd float %383, %399 + %401 = fmul float %396, %98 + %402 = fmul float %396, %99 + %403 = fmul float %396, %100 + %404 = fmul float %398, %101 + %405 = fadd float %404, %401 + %406 = fmul float %398, %102 + %407 = fadd float %406, %402 + %408 = fmul float %398, %103 + %409 = fadd float %408, %403 + %410 = fmul float %400, %104 + %411 = fadd float %410, %405 + %412 = fmul float %400, %105 + %413 = fadd float %412, %407 + %414 = fmul float %400, %106 + %415 = fadd float %414, %409 + %416 = bitcast float %136 to i32 + %417 = bitcast float %182 to i32 + %418 = bitcast float %137 to i32 + %419 = bitcast float %183 to i32 + %420 = bitcast float %temp28.1 to i32 + %421 = bitcast float %temp29.1 to 
i32 + %422 = insertelement <8 x i32> undef, i32 %416, i32 0 + %423 = insertelement <8 x i32> %422, i32 %417, i32 1 + %424 = insertelement <8 x i32> %423, i32 %418, i32 2 + %425 = insertelement <8 x i32> %424, i32 %419, i32 3 + %426 = insertelement <8 x i32> %425, i32 %420, i32 4 + %427 = insertelement <8 x i32> %426, i32 %421, i32 5 + %428 = insertelement <8 x i32> %427, i32 undef, i32 6 + %429 = insertelement <8 x i32> %428, i32 undef, i32 7 + %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2) + %431 = extractelement <4 x float> %430, i32 0 + %432 = extractelement <4 x float> %430, i32 1 + %433 = extractelement <4 x float> %430, i32 2 + %434 = fmul float %48, %411 + %435 = fmul float %49, %411 + %436 = fmul float %50, %411 + %437 = fmul float %51, %413 + %438 = fadd float %437, %434 + %439 = fmul float %52, %413 + %440 = fadd float %439, %435 + %441 = fmul float %53, %413 + %442 = fadd float %441, %436 + %443 = fmul float %54, %415 + %444 = fadd float %443, %438 + %445 = fmul float %55, %415 + %446 = fadd float %445, %440 + %447 = fmul float %56, %415 + %448 = fadd float %447, %442 + %449 = insertelement <4 x float> undef, float %444, i32 0 + %450 = insertelement <4 x float> %449, float %446, i32 1 + %451 = insertelement <4 x float> %450, float %448, i32 2 + %452 = insertelement <4 x float> %451, float %195, i32 3 + %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452) + %454 = extractelement <4 x float> %453, i32 0 + %455 = extractelement <4 x float> %453, i32 1 + %456 = extractelement <4 x float> %453, i32 2 + %457 = extractelement <4 x float> %453, i32 3 + %458 = call float @fabs(float %456) + %459 = fdiv float 1.000000e+00, %458 + %460 = fmul float %454, %459 + %461 = fadd float %460, 1.500000e+00 + %462 = fmul float %455, %459 + %463 = fadd float %462, 1.500000e+00 + %464 = bitcast float %463 to i32 + %465 = bitcast float %461 to i32 + %466 = bitcast float %457 to i32 + %467 = insertelement <4 x i32> undef, i32 %464, i32 0 + %468 = insertelement <4 x i32> %467, i32 %465, i32 1 + %469 = insertelement <4 x i32> %468, i32 %466, i32 2 + %470 = insertelement <4 x i32> %469, i32 undef, i32 3 + %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4) + %472 = extractelement <4 x float> %471, i32 0 + %473 = extractelement <4 x float> %471, i32 1 + %474 = extractelement <4 x float> %471, i32 2 + %475 = fmul float %431, %472 + %476 = fadd float %475, %329 + %477 = fmul float %432, %473 + %478 = fadd float %477, %330 + %479 = fmul float %433, %474 + %480 = fadd float %479, %331 + %481 = fmul float %107, %107 + %482 = fmul float %108, %108 + %483 = fadd float %482, %481 + %484 = fmul float %109, %109 + %485 = fadd float %483, %484 + %486 = call float @llvm.AMDGPU.rsq.f32(float %485) + %487 = fmul float %107, %486 + %488 = fmul float %108, %486 + %489 = fmul float %109, %486 + %490 = fmul float %377, %40 + %491 = fmul float %378, %41 + %492 = fmul float %379, %42 + %493 = fmul float %359, %487 + %494 = fmul float %360, %488 + %495 = fadd float %494, %493 + %496 = fmul float %361, %489 + %497 = fadd float %495, %496 + %498 = fmul float %497, %359 + %499 = fmul float %497, %360 + %500 = fmul float %497, %361 + %501 = fmul float 2.000000e+00, %498 + %502 = fmul float 2.000000e+00, %499 + %503 = fmul float 2.000000e+00, %500 + %504 = fsub float -0.000000e+00, %501 + %505 = fadd float %487, %504 + %506 = fsub float -0.000000e+00, %502 + %507 = fadd float %488, %506 + %508 = fsub float -0.000000e+00, %503 + 
%509 = fadd float %489, %508 + %510 = fmul float %95, %95 + %511 = fmul float %96, %96 + %512 = fadd float %511, %510 + %513 = fmul float %97, %97 + %514 = fadd float %512, %513 + %515 = call float @llvm.AMDGPU.rsq.f32(float %514) + %516 = fmul float %95, %515 + %517 = fmul float %96, %515 + %518 = fmul float %97, %515 + %519 = fmul float %505, %516 + %520 = fmul float %507, %517 + %521 = fadd float %520, %519 + %522 = fmul float %509, %518 + %523 = fadd float %521, %522 + %524 = fsub float -0.000000e+00, %523 + %525 = fcmp uge float %524, 0.000000e+00 + %526 = select i1 %525, float %524, float 0.000000e+00 + %527 = fmul float %43, %380 + %528 = fadd float %527, 1.000000e+00 + %529 = call float @llvm.pow.f32(float %526, float %528) + %530 = fmul float %476, %37 + %531 = fmul float %478, %38 + %532 = fmul float %480, %39 + %533 = fmul float %359, %487 + %534 = fmul float %360, %488 + %535 = fadd float %534, %533 + %536 = fmul float %361, %489 + %537 = fadd float %535, %536 + %538 = fcmp uge float %537, 0.000000e+00 + %539 = select i1 %538, float %537, float 0.000000e+00 + %540 = fmul float %530, %539 + %541 = fmul float %531, %539 + %542 = fmul float %532, %539 + %543 = fmul float %490, %529 + %544 = fadd float %543, %540 + %545 = fmul float %491, %529 + %546 = fadd float %545, %541 + %547 = fmul float %492, %529 + %548 = fadd float %547, %542 + %549 = fmul float %476, %34 + %550 = fmul float %478, %35 + %551 = fmul float %480, %36 + %552 = fmul float %544, %57 + %553 = fadd float %552, %549 + %554 = fmul float %546, %58 + %555 = fadd float %554, %550 + %556 = fmul float %548, %59 + %557 = fadd float %556, %551 + %558 = bitcast float %136 to i32 + %559 = bitcast float %182 to i32 + %560 = bitcast float %137 to i32 + %561 = bitcast float %183 to i32 + %562 = bitcast float %temp28.1 to i32 + %563 = bitcast float %temp29.1 to i32 + %564 = insertelement <8 x i32> undef, i32 %558, i32 0 + %565 = insertelement <8 x i32> %564, i32 %559, i32 1 + %566 = insertelement <8 x i32> %565, i32 %560, i32 2 + %567 = insertelement <8 x i32> %566, i32 %561, i32 3 + %568 = insertelement <8 x i32> %567, i32 %562, i32 4 + %569 = insertelement <8 x i32> %568, i32 %563, i32 5 + %570 = insertelement <8 x i32> %569, i32 undef, i32 6 + %571 = insertelement <8 x i32> %570, i32 undef, i32 7 + %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2) + %573 = extractelement <4 x float> %572, i32 0 + %574 = extractelement <4 x float> %572, i32 1 + %575 = extractelement <4 x float> %572, i32 2 + %576 = fmul float %573, %44 + %577 = fadd float %576, %553 + %578 = fmul float %574, %45 + %579 = fadd float %578, %555 + %580 = fmul float %575, %46 + %581 = fadd float %580, %557 + %582 = call i32 @llvm.SI.packf16(float %577, float %579) + %583 = bitcast i32 %582 to float + %584 = call i32 @llvm.SI.packf16(float %581, float %283) + %585 = bitcast i32 %584 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585) + ret void + +ENDIF66: ; preds = %LOOP65 + %586 = bitcast float %temp28.1 to i32 + %587 = bitcast float %temp29.1 to i32 + %588 = insertelement <8 x i32> %237, i32 %586, i32 4 + %589 = insertelement <8 x i32> %588, i32 %587, i32 5 + %590 = insertelement <8 x i32> %589, i32 undef, i32 6 + %591 = insertelement <8 x i32> %590, i32 undef, i32 7 + %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2) + %593 = extractelement <4 x float> %592, i32 3 + %594 = fcmp oge float 
%temp30.1, %593 + %595 = sext i1 %594 to i32 + %596 = bitcast i32 %595 to float + %597 = bitcast float %596 to i32 + %598 = and i32 %597, 1065353216 + %599 = bitcast i32 %598 to float + %600 = fmul float 5.000000e-01, %temp32.0 + %601 = fsub float -0.000000e+00, %600 + %602 = fmul float %599, %temp32.0 + %603 = fadd float %602, %601 + %604 = fmul float %214, %603 + %605 = fadd float %604, %temp28.1 + %606 = fmul float %215, %603 + %607 = fadd float %606, %temp29.1 + %608 = fmul float %216, %603 + %609 = fadd float %608, %temp30.1 + %610 = fadd float %temp24.1, 1.000000e+00 + %611 = fmul float %temp32.0, 5.000000e-01 + br label %LOOP65 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 + +; Function Attrs: readnone +declare i32 @llvm.SI.tid() #2 + +; Function Attrs: readonly +declare float @ceil(float) #3 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.rsq.f32(float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2 + +; Function Attrs: readnone +declare float @fabs(float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readonly +declare float @llvm.pow.f32(float, float) #4 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } +attributes #3 = { readonly } +attributes #4 = { nounwind readonly } + +!0 = !{!"const", null, i32 1} + +; CHECK-LABEL: {{^}}main1: +; CHECK: s_endpgm +define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 + %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 + %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0) + %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4) + %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8) + %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12) + %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28) + %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48) + %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52) + %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56) + %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64) + %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68) + %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72) + %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76) + %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) + %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) + %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) + %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148) + %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152) + %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) + %41 = call float 
@llvm.SI.load.const(<16 x i8> %22, i32 164) + %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168) + %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172) + %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) + %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) + %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) + %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) + %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) + %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) + %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) + %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) + %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) + %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220) + %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236) + %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) + %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) + %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) + %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252) + %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) + %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260) + %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264) + %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268) + %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) + %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) + %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) + %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284) + %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) + %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) + %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464) + %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468) + %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472) + %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496) + %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500) + %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504) + %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512) + %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516) + %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524) + %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532) + %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536) + %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540) + %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544) + %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548) + %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552) + %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556) + %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560) + %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564) + %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568) + %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572) + %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576) + %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580) + %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584) + %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588) + %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592) + %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596) + %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600) + %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604) + %97 = call float @llvm.SI.load.const(<16 x 
i8> %22, i32 608) + %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612) + %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616) + %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624) + %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628) + %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632) + %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636) + %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640) + %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644) + %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648) + %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652) + %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656) + %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660) + %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664) + %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668) + %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672) + %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676) + %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680) + %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684) + %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688) + %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692) + %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696) + %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700) + %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704) + %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708) + %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712) + %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716) + %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864) + %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868) + %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 + %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0 + %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 + %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0 + %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 + %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0 + %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 + %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0 + %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 + %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0 + %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 + %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0 + %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 + %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0 + %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 + %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0 + %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 + %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0 + %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 + %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0 + %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 + %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0 + %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] 
addrspace(2)* %1, i64 0, i32 5 + %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0 + %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 + %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0 + %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 + %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0 + %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 + %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0 + %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 + %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0 + %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8 + %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0 + %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8 + %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0 + %162 = fcmp ugt float %17, 0.000000e+00 + %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00 + %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) + %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) + %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6) + %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6) + %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) + %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) + %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) + %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6) + %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) + %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) + %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) + %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6) + %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) + %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) + %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) + %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6) + %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) + %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) + %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) + %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6) + %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) + %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) + %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) + %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6) + %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6) + %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6) + %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6) + %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6) + %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6) + %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6) + %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6) + %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6) + %196 = fmul float %14, %124 + 
%197 = fadd float %196, %125 + %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00) + %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) + %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) + %202 = bitcast float %198 to i32 + %203 = icmp ne i32 %202, 0 + %. = select i1 %203, float -1.000000e+00, float 1.000000e+00 + %204 = fsub float -0.000000e+00, %164 + %205 = fadd float %44, %204 + %206 = fsub float -0.000000e+00, %165 + %207 = fadd float %45, %206 + %208 = fsub float -0.000000e+00, %166 + %209 = fadd float %46, %208 + %210 = fmul float %205, %205 + %211 = fmul float %207, %207 + %212 = fadd float %211, %210 + %213 = fmul float %209, %209 + %214 = fadd float %212, %213 + %215 = call float @llvm.AMDGPU.rsq.f32(float %214) + %216 = fmul float %205, %215 + %217 = fmul float %207, %215 + %218 = fmul float %209, %215 + %219 = fmul float %., %54 + %220 = fmul float %13, %47 + %221 = fmul float %197, %48 + %222 = bitcast float %174 to i32 + %223 = bitcast float %175 to i32 + %224 = insertelement <2 x i32> undef, i32 %222, i32 0 + %225 = insertelement <2 x i32> %224, i32 %223, i32 1 + %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2) + %227 = extractelement <4 x float> %226, i32 0 + %228 = extractelement <4 x float> %226, i32 1 + %229 = extractelement <4 x float> %226, i32 2 + %230 = extractelement <4 x float> %226, i32 3 + %231 = fmul float %227, 0x4012611180000000 + %232 = fmul float %228, 0x4012611180000000 + %233 = fmul float %229, 0x4012611180000000 + %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00) + %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00) + %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00) + %237 = fmul float %216, %184 + %238 = fmul float %217, %185 + %239 = fadd float %238, %237 + %240 = fmul float %218, %186 + %241 = fadd float %239, %240 + %242 = fmul float %216, %187 + %243 = fmul float %217, %188 + %244 = fadd float %243, %242 + %245 = fmul float %218, %189 + %246 = fadd float %244, %245 + %247 = fmul float %216, %190 + %248 = fmul float %217, %191 + %249 = fadd float %248, %247 + %250 = fmul float %218, %192 + %251 = fadd float %249, %250 + %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00) + %253 = fmul float %214, 0x3F5A36E2E0000000 + %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00) + %255 = fsub float -0.000000e+00, %254 + %256 = fadd float 1.000000e+00, %255 + %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01) + %258 = fmul float %39, %257 + %259 = fmul float %241, %258 + %260 = fmul float %246, %258 + %261 = fmul float %259, %230 + %262 = fmul float %260, %230 + %263 = fadd float %252, 0x3EE4F8B580000000 + %264 = fsub float -0.000000e+00, %252 + %265 = fadd float 1.000000e+00, %264 + %266 = fmul float 1.200000e+01, %265 + %267 = fadd float %266, 4.000000e+00 + %268 = fsub float -0.000000e+00, %267 + %269 = fmul float %268, %263 + %270 = fsub float -0.000000e+00, %267 + %271 = fmul float %270, %263 + %272 = fsub float -0.000000e+00, %267 + %273 = fmul float %272, %263 + %274 = fdiv float 1.000000e+00, %269 + %275 = fdiv float 1.000000e+00, %271 + %276 = fdiv float 1.000000e+00, %273 + %277 = fmul float %261, %274 + %278 
= fmul float %262, %275 + %279 = fmul float %263, %276 + br label %LOOP + +LOOP: ; preds = %LOOP, %main_body + %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ] + %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ] + %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ] + %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ] + %280 = bitcast float %temp168.0 to i32 + %281 = bitcast float %temp169.0 to i32 + %282 = insertelement <4 x i32> undef, i32 %280, i32 0 + %283 = insertelement <4 x i32> %282, i32 %281, i32 1 + %284 = insertelement <4 x i32> %283, i32 0, i32 2 + %285 = insertelement <4 x i32> %284, i32 undef, i32 3 + %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2) + %287 = extractelement <4 x float> %286, i32 3 + %288 = fadd float %temp168.0, %277 + %289 = fadd float %temp169.0, %278 + %290 = fadd float %temp170.0, %279 + %291 = fsub float -0.000000e+00, %287 + %292 = fadd float %290, %291 + %293 = fcmp oge float 0.000000e+00, %292 + %294 = sext i1 %293 to i32 + %295 = bitcast i32 %294 to float + %296 = bitcast float %295 to i32 + %297 = icmp ne i32 %296, 0 + br i1 %297, label %IF189, label %LOOP + +IF189: ; preds = %LOOP + %298 = extractelement <4 x float> %286, i32 0 + %299 = extractelement <4 x float> %286, i32 1 + %300 = extractelement <4 x float> %286, i32 2 + %301 = fsub float -0.000000e+00, %292 + %302 = fadd float %temp144.0, %301 + %303 = fdiv float 1.000000e+00, %302 + %304 = fmul float %292, %303 + %305 = fadd float %304, -1.000000e+00 + %306 = fmul float %305, %277 + %307 = fadd float %306, %288 + %308 = fmul float %305, %278 + %309 = fadd float %308, %289 + %310 = fsub float -0.000000e+00, %176 + %311 = fadd float %307, %310 + %312 = fsub float -0.000000e+00, %177 + %313 = fadd float %309, %312 + %314 = fadd float %176, %311 + %315 = fadd float %177, %313 + %316 = fmul float %311, %67 + %317 = fmul float %313, %68 + %318 = fmul float %316, %55 + %319 = fmul float %316, %56 + %320 = fmul float %317, %57 + %321 = fadd float %320, %318 + %322 = fmul float %317, %58 + %323 = fadd float %322, %319 + %324 = fadd float %178, %321 + %325 = fadd float %179, %323 + %326 = fmul float %316, %59 + %327 = fmul float %316, %60 + %328 = fmul float %316, %61 + %329 = fmul float %316, %62 + %330 = fmul float %317, %63 + %331 = fadd float %330, %326 + %332 = fmul float %317, %64 + %333 = fadd float %332, %327 + %334 = fmul float %317, %65 + %335 = fadd float %334, %328 + %336 = fmul float %317, %66 + %337 = fadd float %336, %329 + %338 = fadd float %168, %331 + %339 = fadd float %169, %333 + %340 = fadd float %170, %335 + %341 = fadd float %171, %337 + %342 = bitcast float %338 to i32 + %343 = bitcast float %339 to i32 + %344 = insertelement <2 x i32> undef, i32 %342, i32 0 + %345 = insertelement <2 x i32> %344, i32 %343, i32 1 + %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2) + %347 = extractelement <4 x float> %346, i32 0 + %348 = extractelement <4 x float> %346, i32 1 + %349 = extractelement <4 x float> %346, i32 2 + %350 = extractelement <4 x float> %346, i32 3 + %351 = fmul float %347, %23 + %352 = fmul float %348, %24 + %353 = fmul float %349, %25 + %354 = fmul float %350, %26 + %355 = fmul float %351, %180 + %356 = fmul float %352, %181 + %357 = fmul float %353, %182 + %358 = fmul float %354, %183 + %359 = fsub float -0.000000e+00, %350 + %360 = fadd float 1.000000e+00, %359 + %361 = fmul float %360, %49 + %362 = call float 
@llvm.AMDGPU.lrp(float %361, float %347, float %355) + %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356) + %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357) + %365 = bitcast float %340 to i32 + %366 = bitcast float %341 to i32 + %367 = insertelement <2 x i32> undef, i32 %365, i32 0 + %368 = insertelement <2 x i32> %367, i32 %366, i32 1 + %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2) + %370 = extractelement <4 x float> %369, i32 2 + %371 = fmul float %362, %234 + %372 = fmul float %363, %235 + %373 = fmul float %364, %236 + %374 = fmul float %358, %230 + %375 = bitcast float %314 to i32 + %376 = bitcast float %315 to i32 + %377 = insertelement <2 x i32> undef, i32 %375, i32 0 + %378 = insertelement <2 x i32> %377, i32 %376, i32 1 + %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2) + %380 = extractelement <4 x float> %379, i32 0 + %381 = extractelement <4 x float> %379, i32 1 + %382 = extractelement <4 x float> %379, i32 2 + %383 = extractelement <4 x float> %379, i32 3 + %384 = fcmp olt float 0.000000e+00, %382 + %385 = sext i1 %384 to i32 + %386 = bitcast i32 %385 to float + %387 = bitcast float %386 to i32 + %388 = icmp ne i32 %387, 0 + %.224 = select i1 %388, float %381, float %380 + %.225 = select i1 %388, float %383, float %381 + %389 = bitcast float %324 to i32 + %390 = bitcast float %325 to i32 + %391 = insertelement <2 x i32> undef, i32 %389, i32 0 + %392 = insertelement <2 x i32> %391, i32 %390, i32 1 + %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2) + %394 = extractelement <4 x float> %393, i32 0 + %395 = extractelement <4 x float> %393, i32 1 + %396 = extractelement <4 x float> %393, i32 2 + %397 = extractelement <4 x float> %393, i32 3 + %398 = fcmp olt float 0.000000e+00, %396 + %399 = sext i1 %398 to i32 + %400 = bitcast i32 %399 to float + %401 = bitcast float %400 to i32 + %402 = icmp ne i32 %401, 0 + %temp112.1 = select i1 %402, float %395, float %394 + %temp113.1 = select i1 %402, float %397, float %395 + %403 = fmul float %.224, 2.000000e+00 + %404 = fadd float %403, -1.000000e+00 + %405 = fmul float %.225, 2.000000e+00 + %406 = fadd float %405, -1.000000e+00 + %407 = fmul float %temp112.1, 2.000000e+00 + %408 = fadd float %407, -1.000000e+00 + %409 = fmul float %temp113.1, 2.000000e+00 + %410 = fadd float %409, -1.000000e+00 + %411 = fsub float -0.000000e+00, %404 + %412 = fmul float %411, %35 + %413 = fsub float -0.000000e+00, %406 + %414 = fmul float %413, %35 + %415 = fsub float -0.000000e+00, %408 + %416 = fmul float %415, %36 + %417 = fsub float -0.000000e+00, %410 + %418 = fmul float %417, %36 + %419 = fmul float %416, %370 + %420 = fmul float %418, %370 + %421 = call float @fabs(float %412) + %422 = call float @fabs(float %414) + %423 = fsub float -0.000000e+00, %421 + %424 = fadd float 1.000000e+00, %423 + %425 = fsub float -0.000000e+00, %422 + %426 = fadd float 1.000000e+00, %425 + %427 = fmul float %424, %419 + %428 = fadd float %427, %412 + %429 = fmul float %426, %420 + %430 = fadd float %429, %414 + %431 = fmul float %428, %428 + %432 = fmul float %430, %430 + %433 = fadd float %431, %432 + %434 = fsub float -0.000000e+00, %433 + %435 = fadd float 0x3FF00068E0000000, %434 + %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) + %437 = call float @llvm.AMDGPU.rsq.f32(float %436) + %438 = fmul float %437, %436 + %439 = fsub 
float -0.000000e+00, %436 + %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) + %441 = fmul float %184, %428 + %442 = fmul float %185, %428 + %443 = fmul float %186, %428 + %444 = fmul float %187, %430 + %445 = fadd float %444, %441 + %446 = fmul float %188, %430 + %447 = fadd float %446, %442 + %448 = fmul float %189, %430 + %449 = fadd float %448, %443 + %450 = fmul float %190, %440 + %451 = fadd float %450, %445 + %452 = fmul float %191, %440 + %453 = fadd float %452, %447 + %454 = fmul float %192, %440 + %455 = fadd float %454, %449 + %456 = fmul float %451, %451 + %457 = fmul float %453, %453 + %458 = fadd float %457, %456 + %459 = fmul float %455, %455 + %460 = fadd float %458, %459 + %461 = call float @llvm.AMDGPU.rsq.f32(float %460) + %462 = fmul float %451, %461 + %463 = fmul float %453, %461 + %464 = fmul float %455, %461 + %465 = fcmp olt float 0.000000e+00, %219 + %466 = sext i1 %465 to i32 + %467 = bitcast i32 %466 to float + %468 = bitcast float %467 to i32 + %469 = icmp ne i32 %468, 0 + br i1 %469, label %IF198, label %ENDIF197 + +IF198: ; preds = %IF189 + %470 = fsub float -0.000000e+00, %462 + %471 = fsub float -0.000000e+00, %463 + %472 = fsub float -0.000000e+00, %464 + br label %ENDIF197 + +ENDIF197: ; preds = %IF189, %IF198 + %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ] + %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ] + %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ] + %473 = bitcast float %220 to i32 + %474 = bitcast float %221 to i32 + %475 = insertelement <2 x i32> undef, i32 %473, i32 0 + %476 = insertelement <2 x i32> %475, i32 %474, i32 1 + %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2) + %478 = extractelement <4 x float> %477, i32 0 + %479 = extractelement <4 x float> %477, i32 1 + %480 = extractelement <4 x float> %477, i32 2 + %481 = extractelement <4 x float> %477, i32 3 + %482 = fmul float %478, %40 + %483 = fadd float %482, %41 + %484 = fmul float %479, %40 + %485 = fadd float %484, %41 + %486 = fmul float %480, %40 + %487 = fadd float %486, %41 + %488 = fmul float %481, %42 + %489 = fadd float %488, %43 + %490 = bitcast float %172 to i32 + %491 = bitcast float %173 to i32 + %492 = insertelement <2 x i32> undef, i32 %490, i32 0 + %493 = insertelement <2 x i32> %492, i32 %491, i32 1 + %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2) + %495 = extractelement <4 x float> %494, i32 0 + %496 = extractelement <4 x float> %494, i32 1 + %497 = extractelement <4 x float> %494, i32 2 + %498 = extractelement <4 x float> %494, i32 3 + %499 = fmul float %498, 3.200000e+01 + %500 = fadd float %499, -1.600000e+01 + %501 = call float @llvm.AMDIL.exp.(float %500) + %502 = fmul float %495, %501 + %503 = fmul float %496, %501 + %504 = fmul float %497, %501 + %505 = fmul float %28, %502 + %506 = fadd float %505, %193 + %507 = fmul float %29, %503 + %508 = fadd float %507, %194 + %509 = fmul float %30, %504 + %510 = fadd float %509, %195 + %511 = fmul float %506, %489 + %512 = fmul float %508, %489 + %513 = fmul float %510, %489 + %514 = fmul float %489, 5.000000e-01 + %515 = fadd float %514, 5.000000e-01 + %516 = fmul float %483, %515 + %517 = fadd float %516, %511 + %518 = fmul float %485, %515 + %519 = fadd float %518, %512 + %520 = fmul float %487, %515 + %521 = fadd float %520, %513 + %522 = fmul float %517, %371 + %523 = fmul float %519, %372 + %524 = fmul float %521, %373 + %525 = fmul float %428, 
0x3FDB272440000000 + %526 = fmul float %430, 0xBFDB272440000000 + %527 = fadd float %526, %525 + %528 = fmul float %440, 0x3FE99999A0000000 + %529 = fadd float %527, %528 + %530 = fmul float %529, 5.000000e-01 + %531 = fadd float %530, 0x3FE3333340000000 + %532 = fmul float %531, %531 + %533 = fmul float %522, %532 + %534 = fmul float %523, %532 + %535 = fmul float %524, %532 + %536 = fsub float -0.000000e+00, %72 + %537 = fsub float -0.000000e+00, %73 + %538 = fsub float -0.000000e+00, %74 + %539 = fmul float %temp12.0, %536 + %540 = fmul float %temp13.0, %537 + %541 = fadd float %540, %539 + %542 = fmul float %temp14.0, %538 + %543 = fadd float %541, %542 + %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00) + %545 = fmul float %371, %544 + %546 = fmul float %372, %544 + %547 = fmul float %373, %544 + %548 = fmul float %545, %69 + %549 = fmul float %546, %70 + %550 = fmul float %547, %71 + %551 = fsub float -0.000000e+00, %164 + %552 = fadd float %97, %551 + %553 = fsub float -0.000000e+00, %165 + %554 = fadd float %98, %553 + %555 = fsub float -0.000000e+00, %166 + %556 = fadd float %99, %555 + %557 = fmul float %552, %552 + %558 = fmul float %554, %554 + %559 = fadd float %558, %557 + %560 = fmul float %556, %556 + %561 = fadd float %559, %560 + %562 = call float @llvm.AMDGPU.rsq.f32(float %561) + %563 = fmul float %562, %561 + %564 = fsub float -0.000000e+00, %561 + %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) + %566 = fsub float -0.000000e+00, %84 + %567 = fadd float %565, %566 + %568 = fsub float -0.000000e+00, %83 + %569 = fadd float %565, %568 + %570 = fsub float -0.000000e+00, %82 + %571 = fadd float %565, %570 + %572 = fsub float -0.000000e+00, %84 + %573 = fadd float %83, %572 + %574 = fsub float -0.000000e+00, %83 + %575 = fadd float %82, %574 + %576 = fsub float -0.000000e+00, %82 + %577 = fadd float %81, %576 + %578 = fdiv float 1.000000e+00, %573 + %579 = fdiv float 1.000000e+00, %575 + %580 = fdiv float 1.000000e+00, %577 + %581 = fmul float %567, %578 + %582 = fmul float %569, %579 + %583 = fmul float %571, %580 + %584 = fcmp olt float %565, %83 + %585 = sext i1 %584 to i32 + %586 = bitcast i32 %585 to float + %587 = bitcast float %586 to i32 + %588 = icmp ne i32 %587, 0 + br i1 %588, label %ENDIF200, label %ELSE202 + +ELSE202: ; preds = %ENDIF197 + %589 = fcmp olt float %565, %82 + %590 = sext i1 %589 to i32 + %591 = bitcast i32 %590 to float + %592 = bitcast float %591 to i32 + %593 = icmp ne i32 %592, 0 + br i1 %593, label %ENDIF200, label %ELSE205 + +ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197 + %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ] + %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ] + %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ] + %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ] + %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ] + %594 = fcmp olt float %565, %83 + %595 = sext i1 %594 to i32 + %596 = bitcast i32 %595 to float + %597 = bitcast float %596 to i32 + %598 = icmp ne i32 %597, 0 + br i1 %598, label %ENDIF209, label %ELSE211 + +ELSE205: ; preds = %ELSE202 + %599 = fcmp olt float %565, %81 + %600 = sext i1 %599 to i32 + %601 = bitcast i32 %600 to float + %602 = bitcast float %601 to i32 + %603 = icmp ne i32 %602, 0 + %.226 = select i1 %603, float %583, float 1.000000e+00 + %.227 = select i1 %603, float 
%118, float %116 + %.228 = select i1 %603, float %119, float %117 + br label %ENDIF200 + +ELSE211: ; preds = %ENDIF200 + %604 = fcmp olt float %565, %82 + %605 = sext i1 %604 to i32 + %606 = bitcast i32 %605 to float + %607 = bitcast float %606 to i32 + %608 = icmp ne i32 %607, 0 + br i1 %608, label %ENDIF209, label %ELSE214 + +ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200 + %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ] + %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ] + %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ] + %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ] + %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ] + %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ] + %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ] + %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ] + %609 = fmul float %164, %85 + %610 = fmul float %165, %86 + %611 = fadd float %609, %610 + %612 = fmul float %166, %87 + %613 = fadd float %611, %612 + %614 = fmul float %167, %88 + %615 = fadd float %613, %614 + %616 = fmul float %164, %89 + %617 = fmul float %165, %90 + %618 = fadd float %616, %617 + %619 = fmul float %166, %91 + %620 = fadd float %618, %619 + %621 = fmul float %167, %92 + %622 = fadd float %620, %621 + %623 = fmul float %164, %93 + %624 = fmul float %165, %94 + %625 = fadd float %623, %624 + %626 = fmul float %166, %95 + %627 = fadd float %625, %626 + %628 = fmul float %167, %96 + %629 = fadd float %627, %628 + %630 = fsub float -0.000000e+00, %78 + %631 = fadd float 1.000000e+00, %630 + %632 = call float @fabs(float %615) + %633 = call float @fabs(float %622) + %634 = fcmp oge float %631, %632 + %635 = sext i1 %634 to i32 + %636 = bitcast i32 %635 to float + %637 = bitcast float %636 to i32 + %638 = and i32 %637, 1065353216 + %639 = bitcast i32 %638 to float + %640 = fcmp oge float %631, %633 + %641 = sext i1 %640 to i32 + %642 = bitcast i32 %641 to float + %643 = bitcast float %642 to i32 + %644 = and i32 %643, 1065353216 + %645 = bitcast i32 %644 to float + %646 = fmul float %639, %645 + %647 = fmul float %629, %646 + %648 = fmul float %615, %temp68.0 + %649 = fadd float %648, %temp70.0 + %650 = fmul float %622, %temp69.0 + %651 = fadd float %650, %temp71.0 + %652 = fmul float %615, %temp52.0 + %653 = fadd float %652, %temp54.0 + %654 = fmul float %622, %temp53.0 + %655 = fadd float %654, %temp55.0 + %656 = fadd float %temp80.0, -1.000000e+00 + %657 = fmul float %656, %77 + %658 = fadd float %657, 1.000000e+00 + %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00) + %660 = bitcast float %649 to i32 + %661 = bitcast float %651 to i32 + %662 = bitcast float 0.000000e+00 to i32 + %663 = insertelement <4 x i32> undef, i32 %660, i32 0 + %664 = insertelement <4 x i32> %663, i32 %661, i32 1 + %665 = insertelement <4 x i32> %664, i32 %662, i32 2 + %666 = insertelement <4 x i32> %665, i32 undef, i32 3 + %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2) + %668 = extractelement <4 x float> %667, i32 0 + %669 = extractelement <4 x float> %667, i32 1 + %670 = bitcast float %653 to i32 + %671 = bitcast float %655 to i32 + %672 = bitcast float 0.000000e+00 to i32 + %673 = insertelement <4 x i32> undef, i32 %670, i32 0 + %674 = insertelement <4 x i32> %673, 
i32 %671, i32 1 + %675 = insertelement <4 x i32> %674, i32 %672, i32 2 + %676 = insertelement <4 x i32> %675, i32 undef, i32 3 + %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2) + %678 = extractelement <4 x float> %677, i32 0 + %679 = extractelement <4 x float> %677, i32 1 + %680 = fsub float -0.000000e+00, %669 + %681 = fadd float 1.000000e+00, %680 + %682 = fsub float -0.000000e+00, %679 + %683 = fadd float 1.000000e+00, %682 + %684 = fmul float %681, 2.500000e-01 + %685 = fmul float %683, 2.500000e-01 + %686 = fsub float -0.000000e+00, %684 + %687 = fadd float %668, %686 + %688 = fsub float -0.000000e+00, %685 + %689 = fadd float %678, %688 + %690 = fmul float %647, %temp88.0 + %691 = fadd float %690, %temp89.0 + %692 = fmul float %647, %temp90.0 + %693 = fadd float %692, %temp91.0 + %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00) + %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00) + %696 = fsub float -0.000000e+00, %694 + %697 = fadd float %668, %696 + %698 = fsub float -0.000000e+00, %695 + %699 = fadd float %678, %698 + %700 = fmul float %668, %668 + %701 = fmul float %678, %678 + %702 = fsub float -0.000000e+00, %700 + %703 = fadd float %687, %702 + %704 = fsub float -0.000000e+00, %701 + %705 = fadd float %689, %704 + %706 = fcmp uge float %703, %75 + %707 = select i1 %706, float %703, float %75 + %708 = fcmp uge float %705, %75 + %709 = select i1 %708, float %705, float %75 + %710 = fmul float %697, %697 + %711 = fadd float %710, %707 + %712 = fmul float %699, %699 + %713 = fadd float %712, %709 + %714 = fdiv float 1.000000e+00, %711 + %715 = fdiv float 1.000000e+00, %713 + %716 = fmul float %707, %714 + %717 = fmul float %709, %715 + %718 = fcmp oge float %697, 0.000000e+00 + %719 = sext i1 %718 to i32 + %720 = bitcast i32 %719 to float + %721 = bitcast float %720 to i32 + %722 = icmp ne i32 %721, 0 + %.229 = select i1 %722, float 1.000000e+00, float %716 + %723 = fcmp oge float %699, 0.000000e+00 + %724 = sext i1 %723 to i32 + %725 = bitcast i32 %724 to float + %726 = bitcast float %725 to i32 + %727 = icmp ne i32 %726, 0 + %temp28.0 = select i1 %727, float 1.000000e+00, float %717 + %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229) + %729 = call float @llvm.pow.f32(float %728, float %76) + %730 = fmul float %729, %79 + %731 = fadd float %730, %80 + %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00) + %733 = fmul float %732, %732 + %734 = fmul float 2.000000e+00, %732 + %735 = fsub float -0.000000e+00, %734 + %736 = fadd float 3.000000e+00, %735 + %737 = fmul float %733, %736 + %738 = fmul float %548, %737 + %739 = fmul float %549, %737 + %740 = fmul float %550, %737 + %741 = fmul float %738, %515 + %742 = fadd float %741, %533 + %743 = fmul float %739, %515 + %744 = fadd float %743, %534 + %745 = fmul float %740, %515 + %746 = fadd float %745, %535 + %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00) + %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00) + %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00) + %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00) + %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00) + %752 = fmul float %748, %751 + %753 = fmul float %749, %751 + %754 = fmul float %750, %751 + %755 = fmul float %742, %752 + %756 = fmul float 
%744, %753 + %757 = fmul float %746, %754 + %758 = fmul float %temp12.0, %216 + %759 = fmul float %temp13.0, %217 + %760 = fadd float %759, %758 + %761 = fmul float %temp14.0, %218 + %762 = fadd float %760, %761 + %763 = call float @fabs(float %762) + %764 = fmul float %763, %763 + %765 = fmul float %764, %50 + %766 = fadd float %765, %51 + %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00) + %768 = fsub float -0.000000e+00, %767 + %769 = fadd float 1.000000e+00, %768 + %770 = fmul float %33, %769 + %771 = fmul float %33, %769 + %772 = fmul float %33, %769 + %773 = fmul float %34, %769 + %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755) + %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756) + %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757) + %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374) + %778 = fcmp uge float %774, 0x3E6FFFFE60000000 + %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000 + %780 = fcmp uge float %775, 0x3E6FFFFE60000000 + %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000 + %782 = fcmp uge float %776, 0x3E6FFFFE60000000 + %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000 + %784 = fcmp uge float %779, 6.550400e+04 + %785 = select i1 %784, float 6.550400e+04, float %779 + %786 = fcmp uge float %781, 6.550400e+04 + %787 = select i1 %786, float 6.550400e+04, float %781 + %788 = fcmp uge float %783, 6.550400e+04 + %789 = select i1 %788, float 6.550400e+04, float %783 + %790 = fmul float %777, %52 + %791 = fadd float %790, %53 + %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00) + %793 = call i32 @llvm.SI.packf16(float %785, float %787) + %794 = bitcast i32 %793 to float + %795 = call i32 @llvm.SI.packf16(float %789, float %792) + %796 = bitcast i32 %795 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796) + ret void + +ELSE214: ; preds = %ELSE211 + %797 = fcmp olt float %565, %81 + %798 = sext i1 %797 to i32 + %799 = bitcast i32 %798 to float + %800 = bitcast float %799 to i32 + %801 = icmp ne i32 %800, 0 + %.230 = select i1 %801, float %104, float %100 + %.231 = select i1 %801, float %105, float %101 + %.232 = select i1 %801, float %106, float %102 + %.233 = select i1 %801, float %107, float %103 + br label %ENDIF209 +} + +; Function Attrs: readnone +declare float @llvm.AMDIL.clamp.(float, float, float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #2 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.cndlt(float, float, float) #2 + +; Function Attrs: readnone +declare float @llvm.AMDIL.exp.(float) #2 + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } +attributes #3 = { nounwind readonly } +attributes #4 = { readonly } diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll new file mode 100644 index 00000000000..4b2d8ec6bf0 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -0,0 +1,501 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | 
FileCheck -check-prefix=SI %s + +; If this occurs it is likely due to reordering and the restore was +; originally supposed to happen before SI_END_CF. +; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] +; SI-NOT: v_readlane_b32 [[SAVED]] + +define void @main() #0 { +main_body: + %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) + %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) + %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) + %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) + %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) + %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) + %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) + %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) + %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) + %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) + %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) + %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) + %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) + %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) + %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) + %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) + %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) + %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) + %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) + %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) + %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) + %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) + %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) + %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) + %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) + %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) + %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) + %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) + %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) + %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) + %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) + %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) + %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) + %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) + %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) + %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) + %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) + %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) + %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) + %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) + %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) + %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) + %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) + %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) + %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) + %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) + %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) + %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) + %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) + %50 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 336) + %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) + %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) + %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) + %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) + %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) + %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) + %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) + %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) + %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) + %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) + %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) + %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) + %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) + %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) + %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) + %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + br label %LOOP + +LOOP: ; preds = %ENDIF2795, %main_body + %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] + %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] + %67 = icmp sgt i32 undef, 4 + br i1 %67, label %ENDLOOP, label %ENDIF + +ENDLOOP: ; preds = %ELSE2566, %LOOP + %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00) + ret void + +ENDIF: ; preds = %LOOP + %69 = fsub float %2, undef + %70 = fsub float %3, undef + %71 = fsub float %4, undef + %72 = fmul float %69, 0.000000e+00 + %73 = fmul float %70, undef + %74 = fmul float %71, undef + %75 = fsub float %6, undef + %76 = fsub float %7, undef + %77 = fmul float %75, undef + %78 = fmul float %76, 0.000000e+00 + %79 = call float @llvm.minnum.f32(float %74, float %78) + %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00) + %81 = call float @llvm.maxnum.f32(float %73, float %77) + %82 = call float @llvm.maxnum.f32(float undef, float %79) + %83 = call float @llvm.minnum.f32(float %80, float %81) + %84 = call float @llvm.minnum.f32(float %83, float undef) + %85 = fsub float %14, undef + %86 = fsub float %15, undef + %87 = fsub float %16, undef + %88 = fmul float %85, undef + %89 = fmul float %86, undef + %90 = fmul float %87, undef + %91 = fsub float %17, undef + %92 = fsub float %18, undef + %93 = fsub float %19, undef + %94 = fmul float %91, 0.000000e+00 + %95 = fmul float %92, undef + %96 = fmul float %93, undef + %97 = call float @llvm.minnum.f32(float %89, float %95) + %98 = call float @llvm.maxnum.f32(float %88, float %94) + %99 = call float @llvm.maxnum.f32(float %90, float %96) + %100 = call float @llvm.maxnum.f32(float undef, float %97) + %101 = call float @llvm.maxnum.f32(float %100, float undef) + %102 = call float @llvm.minnum.f32(float %98, float undef) + %103 = call float @llvm.minnum.f32(float %102, float %99) + %104 = fsub float %30, undef + %105 = fsub float %31, undef + %106 = fmul float %104, 0.000000e+00 + %107 = fmul float %105, 0.000000e+00 + %108 = call float @llvm.minnum.f32(float undef, float %106) + %109 = call float @llvm.maxnum.f32(float undef, float %107) + %110 = call float @llvm.maxnum.f32(float undef, float %108) + %111 = call float @llvm.maxnum.f32(float %110, float undef) + %112 = call float @llvm.minnum.f32(float undef, float %109) + %113 = 
fsub float %32, undef + %114 = fsub float %33, undef + %115 = fsub float %34, undef + %116 = fmul float %113, 0.000000e+00 + %117 = fmul float %114, undef + %118 = fmul float %115, undef + %119 = fsub float %35, undef + %120 = fsub float %36, undef + %121 = fsub float %37, undef + %122 = fmul float %119, undef + %123 = fmul float %120, undef + %124 = fmul float %121, undef + %125 = call float @llvm.minnum.f32(float %116, float %122) + %126 = call float @llvm.minnum.f32(float %117, float %123) + %127 = call float @llvm.minnum.f32(float %118, float %124) + %128 = call float @llvm.maxnum.f32(float %125, float %126) + %129 = call float @llvm.maxnum.f32(float %128, float %127) + %130 = fsub float %38, undef + %131 = fsub float %39, undef + %132 = fsub float %40, undef + %133 = fmul float %130, 0.000000e+00 + %134 = fmul float %131, undef + %135 = fmul float %132, undef + %136 = fsub float %41, undef + %137 = fsub float %42, undef + %138 = fsub float %43, undef + %139 = fmul float %136, undef + %140 = fmul float %137, undef + %141 = fmul float %138, undef + %142 = call float @llvm.minnum.f32(float %133, float %139) + %143 = call float @llvm.minnum.f32(float %134, float %140) + %144 = call float @llvm.minnum.f32(float %135, float %141) + %145 = call float @llvm.maxnum.f32(float %142, float %143) + %146 = call float @llvm.maxnum.f32(float %145, float %144) + %147 = fsub float %44, undef + %148 = fsub float %45, undef + %149 = fsub float %46, undef + %150 = fmul float %147, 0.000000e+00 + %151 = fmul float %148, 0.000000e+00 + %152 = fmul float %149, undef + %153 = fsub float %47, undef + %154 = fsub float %48, undef + %155 = fsub float %49, undef + %156 = fmul float %153, undef + %157 = fmul float %154, 0.000000e+00 + %158 = fmul float %155, undef + %159 = call float @llvm.minnum.f32(float %150, float %156) + %160 = call float @llvm.minnum.f32(float %151, float %157) + %161 = call float @llvm.minnum.f32(float %152, float %158) + %162 = call float @llvm.maxnum.f32(float %159, float %160) + %163 = call float @llvm.maxnum.f32(float %162, float %161) + %164 = fsub float %50, undef + %165 = fsub float %51, undef + %166 = fsub float %52, undef + %167 = fmul float %164, undef + %168 = fmul float %165, 0.000000e+00 + %169 = fmul float %166, 0.000000e+00 + %170 = fsub float %53, undef + %171 = fsub float %54, undef + %172 = fsub float %55, undef + %173 = fdiv float 1.000000e+00, %temp18.0 + %174 = fmul float %170, undef + %175 = fmul float %171, undef + %176 = fmul float %172, %173 + %177 = call float @llvm.minnum.f32(float %167, float %174) + %178 = call float @llvm.minnum.f32(float %168, float %175) + %179 = call float @llvm.minnum.f32(float %169, float %176) + %180 = call float @llvm.maxnum.f32(float %177, float %178) + %181 = call float @llvm.maxnum.f32(float %180, float %179) + %182 = fsub float %62, undef + %183 = fsub float %63, undef + %184 = fsub float %64, undef + %185 = fmul float %182, 0.000000e+00 + %186 = fmul float %183, undef + %187 = fmul float %184, undef + %188 = fsub float %65, undef + %189 = fsub float %66, undef + %190 = fmul float %188, undef + %191 = fmul float %189, undef + %192 = call float @llvm.maxnum.f32(float %185, float %190) + %193 = call float @llvm.maxnum.f32(float %186, float %191) + %194 = call float @llvm.maxnum.f32(float %187, float undef) + %195 = call float @llvm.minnum.f32(float %192, float %193) + %196 = call float @llvm.minnum.f32(float %195, float %194) + %.temp292.7 = select i1 undef, float %163, float undef + %temp292.9 = select i1 false, float %181, float 
%.temp292.7 + %.temp292.9 = select i1 undef, float undef, float %temp292.9 + %197 = fcmp ogt float undef, 0.000000e+00 + %198 = fcmp olt float undef, %196 + %199 = and i1 %197, %198 + %200 = fcmp olt float undef, %.temp292.9 + %201 = and i1 %199, %200 + %temp292.11 = select i1 %201, float undef, float %.temp292.9 + br i1 undef, label %IF2565, label %ELSE2566 + +IF2565: ; preds = %ENDIF + br i1 false, label %ENDIF2582, label %ELSE2584 + +ELSE2566: ; preds = %ENDIF + %202 = fcmp oeq float %temp292.11, 1.000000e+04 + br i1 %202, label %ENDLOOP, label %ELSE2593 + +ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 + %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] + %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ] + %203 = fsub float %5, undef + %204 = fmul float %203, undef + %205 = call float @llvm.maxnum.f32(float undef, float %204) + %206 = call float @llvm.minnum.f32(float %205, float undef) + %207 = call float @llvm.minnum.f32(float %206, float undef) + %208 = fcmp ogt float undef, 0.000000e+00 + %209 = fcmp olt float undef, 1.000000e+00 + %210 = and i1 %208, %209 + %211 = fcmp olt float undef, %207 + %212 = and i1 %210, %211 + br i1 %212, label %ENDIF2795, label %ELSE2797 + +ELSE2584: ; preds = %IF2565 + br label %ENDIF2582 + +ENDIF2582: ; preds = %ELSE2584, %IF2565 + %213 = fadd float %1, undef + %214 = fadd float 0.000000e+00, %213 + %215 = call float @llvm.AMDIL.fraction.(float %214) + br i1 undef, label %IF2589, label %ELSE2590 + +IF2589: ; preds = %ENDIF2582 + br label %ENDIF2588 + +ELSE2590: ; preds = %ENDIF2582 + br label %ENDIF2588 + +ENDIF2588: ; preds = %ELSE2590, %IF2589 + %216 = fsub float 1.000000e+00, %215 + %217 = call float @llvm.sqrt.f32(float %216) + %218 = fmul float %217, undef + %219 = fadd float %218, undef + br label %ENDIF2564 + +ELSE2593: ; preds = %ELSE2566 + %220 = fcmp oeq float %temp292.11, %82 + %221 = fcmp olt float %82, %84 + %222 = and i1 %220, %221 + br i1 %222, label %ENDIF2594, label %ELSE2596 + +ELSE2596: ; preds = %ELSE2593 + %223 = fcmp oeq float %temp292.11, %101 + %224 = fcmp olt float %101, %103 + %225 = and i1 %223, %224 + br i1 %225, label %ENDIF2594, label %ELSE2632 + +ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 + %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] + %226 = fmul float %temp894.2, undef + br label %ENDIF2564 + +ELSE2632: ; preds = %ELSE2596 + br i1 undef, label %ENDIF2594, label %ELSE2650 + +ELSE2650: ; preds = %ELSE2632 + %227 = fcmp oeq float %temp292.11, %111 + %228 = fcmp olt float %111, %112 + %229 = and i1 %227, %228 + br i1 %229, label %IF2667, label %ELSE2668 + +IF2667: ; preds = %ELSE2650 + br i1 undef, label %ENDIF2594, label %ELSE2671 + +ELSE2668: ; preds = %ELSE2650 + %230 = fcmp oeq float %temp292.11, %129 + %231 = fcmp olt float %129, undef + %232 = and i1 %230, %231 + br i1 %232, label %ENDIF2594, label %ELSE2686 + +ELSE2671: ; preds = %IF2667 + br label %ENDIF2594 + +ELSE2686: ; preds = %ELSE2668 + %233 = fcmp oeq float 
%temp292.11, %146 + %234 = fcmp olt float %146, undef + %235 = and i1 %233, %234 + br i1 %235, label %ENDIF2594, label %ELSE2704 + +ELSE2704: ; preds = %ELSE2686 + %236 = fcmp oeq float %temp292.11, %181 + %237 = fcmp olt float %181, undef + %238 = and i1 %236, %237 + br i1 %238, label %ENDIF2594, label %ELSE2740 + +ELSE2740: ; preds = %ELSE2704 + br i1 undef, label %IF2757, label %ELSE2758 + +IF2757: ; preds = %ELSE2740 + br i1 undef, label %ENDIF2594, label %ELSE2761 + +ELSE2758: ; preds = %ELSE2740 + br i1 undef, label %IF2775, label %ENDIF2594 + +ELSE2761: ; preds = %IF2757 + br label %ENDIF2594 + +IF2775: ; preds = %ELSE2758 + %239 = fcmp olt float undef, undef + br i1 %239, label %ENDIF2594, label %ELSE2779 + +ELSE2779: ; preds = %IF2775 + br i1 undef, label %ENDIF2594, label %ELSE2782 + +ELSE2782: ; preds = %ELSE2779 + br i1 undef, label %ENDIF2594, label %ELSE2785 + +ELSE2785: ; preds = %ELSE2782 + %240 = fcmp olt float undef, 0.000000e+00 + br i1 %240, label %ENDIF2594, label %ELSE2788 + +ELSE2788: ; preds = %ELSE2785 + %241 = fcmp olt float 0.000000e+00, undef + %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00 + br label %ENDIF2594 + +ELSE2797: ; preds = %ENDIF2564 + %242 = fsub float %8, undef + %243 = fsub float %9, undef + %244 = fsub float %10, undef + %245 = fmul float %242, undef + %246 = fmul float %243, undef + %247 = fmul float %244, undef + %248 = fsub float %11, undef + %249 = fsub float %12, undef + %250 = fsub float %13, undef + %251 = fmul float %248, undef + %252 = fmul float %249, undef + %253 = fmul float %250, undef + %254 = call float @llvm.minnum.f32(float %245, float %251) + %255 = call float @llvm.minnum.f32(float %246, float %252) + %256 = call float @llvm.maxnum.f32(float %247, float %253) + %257 = call float @llvm.maxnum.f32(float %254, float %255) + %258 = call float @llvm.maxnum.f32(float %257, float undef) + %259 = call float @llvm.minnum.f32(float undef, float %256) + %260 = fcmp ogt float %258, 0.000000e+00 + %261 = fcmp olt float %258, 1.000000e+00 + %262 = and i1 %260, %261 + %263 = fcmp olt float %258, %259 + %264 = and i1 %262, %263 + br i1 %264, label %ENDIF2795, label %ELSE2800 + +ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 + br label %LOOP + +ELSE2800: ; preds = %ELSE2797 + br i1 undef, label %ENDIF2795, label %ELSE2803 + +ELSE2803: ; preds = %ELSE2800 + %265 = fsub float %20, undef + %266 = fsub float %21, undef + %267 = fsub float %22, undef + %268 = fmul float %265, undef + %269 = fmul float %266, undef + %270 = fmul float %267, 0.000000e+00 + %271 = fsub float %23, undef + %272 = fsub float %24, undef + %273 = fsub float %25, undef + %274 = fmul float %271, undef + %275 = fmul float %272, undef + %276 = fmul float %273, undef + %277 = call float @llvm.minnum.f32(float %268, float %274) + %278 = call float @llvm.maxnum.f32(float %269, float %275) + %279 = call float @llvm.maxnum.f32(float %270, float %276) + %280 = call float @llvm.maxnum.f32(float %277, float undef) + %281 = call float @llvm.maxnum.f32(float %280, float undef) + %282 = call float @llvm.minnum.f32(float undef, float %278) + %283 = call float @llvm.minnum.f32(float %282, float %279) + %284 = fcmp ogt float %281, 0.000000e+00 + %285 = fcmp olt float %281, 1.000000e+00 + %286 = and i1 %284, %285 + %287 = fcmp olt float %281, %283 + %288 = and i1 %286, %287 + br i1 %288, label %ENDIF2795, label %ELSE2806 + +ELSE2806: ; preds = %ELSE2803 + %289 = fsub float %26, 
undef + %290 = fsub float %27, undef + %291 = fsub float %28, undef + %292 = fmul float %289, undef + %293 = fmul float %290, 0.000000e+00 + %294 = fmul float %291, undef + %295 = fsub float %29, undef + %296 = fmul float %295, undef + %297 = call float @llvm.minnum.f32(float %292, float %296) + %298 = call float @llvm.minnum.f32(float %293, float undef) + %299 = call float @llvm.maxnum.f32(float %294, float undef) + %300 = call float @llvm.maxnum.f32(float %297, float %298) + %301 = call float @llvm.maxnum.f32(float %300, float undef) + %302 = call float @llvm.minnum.f32(float undef, float %299) + %303 = fcmp ogt float %301, 0.000000e+00 + %304 = fcmp olt float %301, 1.000000e+00 + %305 = and i1 %303, %304 + %306 = fcmp olt float %301, %302 + %307 = and i1 %305, %306 + br i1 %307, label %ENDIF2795, label %ELSE2809 + +ELSE2809: ; preds = %ELSE2806 + br i1 undef, label %ENDIF2795, label %ELSE2812 + +ELSE2812: ; preds = %ELSE2809 + br i1 undef, label %ENDIF2795, label %ELSE2815 + +ELSE2815: ; preds = %ELSE2812 + br i1 undef, label %ENDIF2795, label %ELSE2818 + +ELSE2818: ; preds = %ELSE2815 + br i1 undef, label %ENDIF2795, label %ELSE2821 + +ELSE2821: ; preds = %ELSE2818 + %308 = fsub float %56, undef + %309 = fsub float %57, undef + %310 = fsub float %58, undef + %311 = fmul float %308, undef + %312 = fmul float %309, 0.000000e+00 + %313 = fmul float %310, undef + %314 = fsub float %59, undef + %315 = fsub float %60, undef + %316 = fsub float %61, undef + %317 = fmul float %314, undef + %318 = fmul float %315, undef + %319 = fmul float %316, undef + %320 = call float @llvm.maxnum.f32(float %311, float %317) + %321 = call float @llvm.maxnum.f32(float %312, float %318) + %322 = call float @llvm.maxnum.f32(float %313, float %319) + %323 = call float @llvm.minnum.f32(float %320, float %321) + %324 = call float @llvm.minnum.f32(float %323, float %322) + %325 = fcmp ogt float undef, 0.000000e+00 + %326 = fcmp olt float undef, 1.000000e+00 + %327 = and i1 %325, %326 + %328 = fcmp olt float undef, %324 + %329 = and i1 %327, %328 + br i1 %329, label %ENDIF2795, label %ELSE2824 + +ELSE2824: ; preds = %ELSE2821 + %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 + br label %ENDIF2795 +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +; Function Attrs: readnone +declare float @llvm.AMDIL.fraction.(float) #2 + +; Function Attrs: nounwind readnone +declare float @llvm.sqrt.f32(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.minnum.f32(float, float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.maxnum.f32(float, float) #1 + +; Function Attrs: readnone +declare float @llvm.AMDGPU.lrp(float, float, float) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } +attributes #1 = { nounwind readnone } +attributes #2 = { readnone } diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll new file mode 100644 index 00000000000..5a6129aaa3f --- /dev/null +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -0,0 +1,236 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s + +declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x 
i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.AMDGPU.barrier.local() #2 + + +@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 +@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8 +@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 + +; FUNC-LABEL: @reorder_local_load_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: buffer_store_dword +define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI: buffer_store_dword +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store volatile i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 +; CI: buffer_store_dword +define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + + %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + call void @llvm.AMDGPU.barrier.local() #2 + %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; Technically we could reorder these, but just comparing the +; instruction type of the load is insufficient. 
+ +; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load +; CI: buffer_load_dword +; CI: buffer_store_dword +; CI: buffer_load_dword +; CI: buffer_store_dword +define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(1)* %gptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_constant_load_local_store_constant_load +; CI: buffer_load_dword +; CI: buffer_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { + %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 + + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load +; CI: s_load_dword +; CI: s_load_dword +; CI: s_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_global_load_local_store_global_load +; CI: buffer_load_dword +; CI: buffer_load_dword +; CI: ds_write_b32 +; CI: buffer_store_dword +define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2 + + %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4 + store i32 99, i32 addrspace(3)* %lptr, align 4 + %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4 + + %add = add nsw i32 %tmp1, %tmp2 + + store i32 %add, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_local_offsets +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: buffer_store_dword +; CI: s_endpgm +define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 + 
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 + %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101 + + store i32 123, i32 addrspace(3)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4 + store i32 123, i32 addrspace(3)* %ptr2, align 4 + %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4 + store i32 789, i32 addrspace(3)* %ptr3, align 4 + + %add.0 = add nsw i32 %tmp2, %tmp1 + %add.1 = add nsw i32 %add.0, %tmp3 + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @reorder_global_offsets +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 +; CI: buffer_store_dword +; CI: s_endpgm +define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 + %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101 + + store i32 123, i32 addrspace(1)* %ptr1, align 4 + %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4 + %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4 + store i32 123, i32 addrspace(1)* %ptr2, align 4 + %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4 + store i32 789, i32 addrspace(1)* %ptr3, align 4 + + %add.0 = add nsw i32 %tmp2, %tmp1 + %add.1 = add nsw i32 %add.0, %tmp3 + store i32 %add.1, i32 addrspace(1)* %out, align 4 + ret void +} + +; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load +; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 +; XCI: TBUFFER_STORE_FORMAT +; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 +; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 { +; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 + +; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 +; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 + +; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 + +; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 +; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, +; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, +; i32 1, i32 0) + +; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 + +; %add = add nsw i32 %tmp1, %tmp2 + +; store i32 %add, i32 addrspace(1)* %out, align 4 +; ret void +; } + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #2 = { nounwind noduplicate } diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll new file mode 100644 
index 00000000000..bd427dd3ed4 --- /dev/null +++ b/test/CodeGen/AMDGPU/si-vector-hang.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; CHECK: {{^}}test_8_min_char: +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; CHECK: buffer_store_byte +; ModuleID = 'radeon' + +define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { +entry: + %0 = load i8, i8 addrspace(1)* %in0, align 1 + %1 = insertelement <8 x i8> undef, i8 %0, i32 0 + %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1 + %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1 + %3 = insertelement <8 x i8> %1, i8 %2, i32 1 + %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2 + %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1 + %5 = insertelement <8 x i8> %3, i8 %4, i32 2 + %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3 + %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1 + %7 = insertelement <8 x i8> %5, i8 %6, i32 3 + %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4 + %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1 + %9 = insertelement <8 x i8> undef, i8 %8, i32 0 + %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5 + %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1 + %11 = insertelement <8 x i8> %9, i8 %10, i32 1 + %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6 + %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1 + %13 = insertelement <8 x i8> %11, i8 %12, i32 2 + %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7 + %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1 + %15 = insertelement <8 x i8> %13, i8 %14, i32 3 + %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> + %16 = load i8, i8 addrspace(1)* %in1, align 1 + %17 = insertelement <8 x i8> undef, i8 %16, i32 0 + %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1 + %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1 + %19 = insertelement <8 x i8> %17, i8 %18, i32 1 + %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2 + %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1 + %21 = insertelement <8 x i8> %19, i8 %20, i32 2 + %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3 + %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1 + %23 = insertelement <8 x i8> %21, i8 %22, i32 3 + %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4 + %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1 + %25 = insertelement <8 x i8> undef, i8 %24, i32 0 + %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5 + %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1 + %27 = insertelement <8 x i8> %25, i8 %26, i32 1 + %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6 + %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1 + %29 = insertelement <8 x i8> %27, i8 %28, i32 2 + %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7 + %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1 + %31 = insertelement <8 x i8> %29, i8 
%30, i32 3 + %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> + %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11 + %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11 + %32 = extractelement <8 x i8> %cond.i, i32 0 + store i8 %32, i8 addrspace(1)* %out, align 1 + %33 = extractelement <8 x i8> %cond.i, i32 1 + %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 + store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1 + %34 = extractelement <8 x i8> %cond.i, i32 2 + %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2 + store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1 + %35 = extractelement <8 x i8> %cond.i, i32 3 + %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3 + store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1 + %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4 + %36 = extractelement <8 x i8> %cond.i, i32 4 + store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1 + %37 = extractelement <8 x i8> %cond.i, i32 5 + %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5 + store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1 + %38 = extractelement <8 x i8> %cond.i, i32 6 + %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6 + store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1 + %39 = extractelement <8 x i8> %cond.i, i32 7 + %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7 + store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} + +!0 = !{null} +!1 = !{null} +!2 = !{null} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char} +!4 = !{null} +!5 = !{null} +!6 = !{null} +!7 = !{null} +!8 = !{null} diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll new file mode 100644 index 00000000000..06bee114c23 --- /dev/null +++ b/test/CodeGen/AMDGPU/sign_extend.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_sext_i1_to_i32: +; SI: v_cndmask_b32_e64 +; SI: s_endpgm +define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}test_s_sext_i32_to_i64: +; SI: s_ashr_i32 +; SI: s_endpg +define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +entry: + %mul = mul i32 %a, %b + %add = add i32 %mul, %c + %sext = sext i32 %add to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i1_to_i64: +; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc +; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] +; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} +; SI: s_endpgm +define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp eq i32 %a, %b + %sext = sext i1 %cmp to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} 
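+; The checks above rely on the i1 result being materialized as 0 or -1 in the
+; low dword with v_cndmask and then copied into the high dword, which is what a
+; full 64-bit sign extension of an i1 looks like.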
+ +; SI-LABEL: {{^}}s_sext_i32_to_i64: +; SI: s_ashr_i32 +; SI: s_endpgm +define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { + %sext = sext i32 %a to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}v_sext_i32_to_i64: +; SI: v_ashr +; SI: s_endpgm +define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { + %val = load i32, i32 addrspace(1)* %in, align 4 + %sext = sext i32 %val to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_sext_i16_to_i64: +; SI: s_endpgm +define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { + %sext = sext i16 %a to i64 + store i64 %sext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll b/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll new file mode 100644 index 00000000000..dffee70b6b0 --- /dev/null +++ b/test/CodeGen/AMDGPU/simplify-demanded-bits-build-pair.ll @@ -0,0 +1,39 @@ +; XFAIL: * +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s + +; 64-bit select was originally lowered with a build_pair, and this +; could be simplified to 1 cndmask instead of 2, but that broken when +; it started being implemented with a v2i32 build_vector and +; bitcasting. +define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, i64 %a, i64 %b + %trunc = trunc i64 %select to i32 + store i32 %trunc, i32 addrspace(1)* %out, align 4 + ret void +} + +; FIXME: Fix truncating store for local memory +; SI-LABEL: {{^}}trunc_load_alloca_i64: +; SI: v_movrels_b32 +; SI-NOT: v_movrels_b32 +; SI: s_endpgm +define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { + %idx = add i32 %a, %b + %alloca = alloca i64, i32 4 + %gep0 = getelementptr i64, i64* %alloca, i64 0 + %gep1 = getelementptr i64, i64* %alloca, i64 1 + %gep2 = getelementptr i64, i64* %alloca, i64 2 + %gep3 = getelementptr i64, i64* %alloca, i64 3 + store i64 24, i64* %gep0, align 8 + store i64 9334, i64* %gep1, align 8 + store i64 3935, i64* %gep2, align 8 + store i64 9342, i64* %gep3, align 8 + %gep = getelementptr i64, i64* %alloca, i32 %idx + %load = load i64, i64* %gep, align 8 + %mask = and i64 %load, 4294967296 + %add = add i64 %mask, -1 + store i64 %add, i64 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll new file mode 100644 index 00000000000..da4e91db3a3 --- /dev/null +++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 +; SI: v_cvt_f64_i32_e32 +define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { + %result = sitofp i32 %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; FIXME: select on 0, 0 +; SI-LABEL: {{^}}sint_to_fp_i1_f64: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already +; uses an SGPR for [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] 
+; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = sitofp i1 %cmp to double + store double %fp, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}sint_to_fp_i1_f64_load: +; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1 +; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { + %fp = sitofp i1 %in to double + store double %fp, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @s_sint_to_fp_i64_to_f64 +define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { + %result = sitofp i64 %in to double + store double %result, double addrspace(1)* %out + ret void +} + +; SI-LABEL: @v_sint_to_fp_i64_to_f64 +; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 +; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %gep, align 8 + %result = sitofp i64 %val to double + store double %result, double addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll new file mode 100644 index 00000000000..8506441d136 --- /dev/null +++ b/test/CodeGen/AMDGPU/sint_to_fp.ll @@ -0,0 +1,64 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32: +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z +; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} +define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { + %result = sitofp i32 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_v2i32: +; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = sitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_v4i32: +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +; SI: v_cvt_f32_i32_e32 +define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %value = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = sitofp <4 x i32> %value to <4 x float> + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + 
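+; For an i1 source no conversion instruction is involved; the checks below
+; expect the result to be produced by selecting between 0 and 1.0 with
+; v_cndmask_b32_e64.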
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) { + %fp = sitofp i1 %in to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll new file mode 100644 index 00000000000..b0c18ca5959 --- /dev/null +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -0,0 +1,111 @@ +; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s + +; SMRD load with an immediate offset. +; GCN-LABEL: {{^}}smrd0: +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 +define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with the largest possible immediate offset. +; GCN-LABEL: {{^}}smrd1: +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with an offset greater than the largest possible immediate. 
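+; On SI the SMRD immediate offset counts dwords and is limited to 8 bits, so
+; index 255 above still encodes as 0xff, while index 256 below no longer fits
+; and has to be materialized in an SGPR first. VI uses a byte offset with a
+; wider field, which is why the same indices show up as 0x3fc and 0x400 in the
+; VI checks.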
+; GCN-LABEL: {{^}}smrd2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +; GCN: s_endpgm +define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load with a 64-bit offset +; GCN-LABEL: {{^}}smrd3: +; FIXME: There are too many copies here because we don't fold immediates +; through REG_SEQUENCE +; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; +; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 +; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] +; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] +; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; FIXME: We should be able to use s_load_dword here +; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 +; TODO: Add VI checks +; GCN: s_endpgm +define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +entry: + %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 + %1 = load i32, i32 addrspace(2)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SMRD load using the load.const intrinsic with an immediate offset +; GCN-LABEL: {{^}}smrd_load_const0: +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 +define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; SMRD load using the load.const intrinsic with the largest possible immediate +; offset. +; GCN-LABEL: {{^}}smrd_load_const1: +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc +define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} +; SMRD load using the load.const intrinsic with an offset greater than the +; largets possible immediate. +; immediate offset. 
+; GCN-LABEL: {{^}}smrd_load_const2: +; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 +; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] +; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 +define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +main_body: + %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 + %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 + %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.SI.load.const(<16 x i8>, i32) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll new file mode 100644 index 00000000000..46409cdfae1 --- /dev/null +++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -0,0 +1,48 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() readnone + +; This is broken because the low half of the 64-bit add remains on the +; SALU, but the upper half does not. The addc expects the carry bit +; set in vcc, which is undefined since the low scalar half add sets +; scc instead. + +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { + %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, 399 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; Doesn't use constants +; FUNC-LABEL @imp_def_vcc_split_i64_add_2 +; SI: v_add_i32 +; SI: v_addc_u32 +define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid + %load = load i32, i32 addrspace(1)* %gep + %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 + %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1 + %bc = bitcast <2 x i32> %vec.1 to i64 + %add = add i64 %bc, %val1 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll new file mode 100644 index 00000000000..bcbc32f4c05 --- /dev/null +++ b/test/CodeGen/AMDGPU/sra.ll @@ -0,0 +1,213 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: 
llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s + +;EG-LABEL: {{^}}ashr_v2i32: +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_v2i32: +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v2i32: +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = ashr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v4i32: +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_v4i32: +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v4i32: +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = ashr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_i64: +;EG: ASHR + +;SI-LABEL: {{^}}ashr_i64: +;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 + +;VI-LABEL: {{^}}ashr_i64: +;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 + +define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) { +entry: + %0 = sext i32 %in to i64 + %1 = ashr i64 %0, 8 + store i64 %1, i64 addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_i64_2: +;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] +;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal +;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal +;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +;EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} + +;SI-LABEL: {{^}}ashr_i64_2: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_i64_2: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +entry: + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1) * %in + %b = load i64, i64 addrspace(1) * %b_ptr + %result = ashr i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v2i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: ASHR {{.*}}, [[SHA]] +;EG-DAG: ASHR {{.*}}, [[SHB]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI-LABEL: {{^}}ashr_v2i64: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v2i64: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1) * %in + %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr + %result = ashr <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}ashr_v4i64: +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHC]] +;EG-DAG: LSHL {{\*? *}}[[COMPSHD]] +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: LSHL {{.*}}, 1 +;EG-DAG: ASHR {{.*}}, [[SHA]] +;EG-DAG: ASHR {{.*}}, [[SHB]] +;EG-DAG: ASHR {{.*}}, [[SHC]] +;EG-DAG: ASHR {{.*}}, [[SHD]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: LSHR {{.*}}, [[SHA]] +;EG-DAG: LSHR {{.*}}, [[SHB]] +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: OR_INT +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: ASHR {{.*}}, literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT +;EG-DAG: CNDE_INT + +;SI-LABEL: {{^}}ashr_v4i64: +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +;VI-LABEL: {{^}}ashr_v4i64: +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1) * %in + %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr + %result = ashr <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll new file mode 100644 index 00000000000..c78fd549b31 --- /dev/null +++ b/test/CodeGen/AMDGPU/srem.ll @@ -0,0 +1,112 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s + +define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in + %den = load i32, i32 addrspace(1) * %den_ptr + %result = srem i32 %num, %den + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = srem i32 %num, 4 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}srem_i32_7: +; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 +; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], +; SI: v_mul_lo_i32 +; SI: v_sub_i32 +; SI: s_endpgm +define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = srem i32 %num, 7 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %num = load <2 x i32>, <2 x i32> addrspace(1) * %in + %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr + %result = srem <2 x i32> %num, %den + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %num = load <2 x i32>, <2 x i32> addrspace(1) * %in + %result = srem <2 x i32> %num, + store <2 x i32> %result, <2 x i32> 
addrspace(1)* %out + ret void +} + +define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %num = load <4 x i32>, <4 x i32> addrspace(1) * %in + %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr + %result = srem <4 x i32> %num, %den + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %num = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = srem <4 x i32> %num, + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %num = load i64, i64 addrspace(1) * %in + %den = load i64, i64 addrspace(1) * %den_ptr + %result = srem i64 %num, %den + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %num = load i64, i64 addrspace(1) * %in + %result = srem i64 %num, 4 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %num = load <2 x i64>, <2 x i64> addrspace(1) * %in + %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr + %result = srem <2 x i64> %num, %den + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %num = load <2 x i64>, <2 x i64> addrspace(1) * %in + %result = srem <2 x i64> %num, + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %num = load <4 x i64>, <4 x i64> addrspace(1) * %in + %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr + %result = srem <4 x i64> %num, %den + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} + +define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %num = load <4 x i64>, <4 x i64> addrspace(1) * %in + %result = srem <4 x i64> %num, + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll new file mode 100644 index 00000000000..4904d7fa1bd --- /dev/null +++ b/test/CodeGen/AMDGPU/srl.ll @@ -0,0 +1,186 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}lshr_i32: +; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = lshr i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v2i32: +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = lshr <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v4i32: +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = lshr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] +; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} +; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 +; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] +; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} +; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} +; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal +; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} +; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 +define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1)* %in + %b = load i64, i64 addrspace(1)* %b_ptr + %result = lshr i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v2i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1)* %in + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = lshr <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}lshr_v4i64: +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} +; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} + +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} +; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} + +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHC]] +; EG-DAG: LSHL {{\*? *}}[[COMPSHD]] +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHL {{.*}}, 1 +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHC]] +; EG-DAG: LSHR {{.*}}, [[SHD]] +; EG-DAG: LSHR {{.*}}, [[SHA]] +; EG-DAG: LSHR {{.*}}, [[SHB]] +; EG-DAG: LSHR {{.*}}, [[SHC]] +; EG-DAG: LSHR {{.*}}, [[SHD]] +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: OR_INT +; EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: LSHR +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal +; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT {{.*}}, 0.0 +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1)* %in + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = lshr <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll new file mode 100644 index 00000000000..26884a1b776 --- /dev/null +++ b/test/CodeGen/AMDGPU/ssubo.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s + +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}ssubo_i64_zext: +define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_ssubo_i32: +define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_ssubo_i32: +define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %ssub, 0 + %carry = extractvalue { i32, i1 } %ssub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_ssubo_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 +define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + 
store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_ssubo_i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %ssub, 0 + %carry = extractvalue { i64, i1 } %ssub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll new file mode 100644 index 00000000000..4a72b4d090a --- /dev/null +++ b/test/CodeGen/AMDGPU/store-barrier.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s + +; This test is for a bug in the machine scheduler where stores without +; an underlying object would be moved across the barrier. In this +; test, the <2 x i8> store will be split into two i8 stores, so they +; won't have an underlying object. + +; CHECK-LABEL: {{^}}test: +; CHECK: ds_write_b8 +; CHECK: ds_write_b8 +; CHECK: s_barrier +; CHECK: s_endpgm +; Function Attrs: nounwind +define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) { +bb: + %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 + %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 + %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 + %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 + %tmp16 = add i32 %tmp13, 1 + %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 + store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 + tail call void @llvm.AMDGPU.barrier.local() #2 + %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 + %tmp26 = sext i32 %tmp25 to i64 + %tmp27 = sext i32 %arg4 to i64 + %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4 + %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1 + %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27 + store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1 + %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0 + %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1 + %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0 + store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1 + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/store-v3i32.ll b/test/CodeGen/AMDGPU/store-v3i32.ll new file mode 100644 index 00000000000..33617b55ed6 --- /dev/null +++ b/test/CodeGen/AMDGPU/store-v3i32.ll @@ -0,0 +1,13 @@ +; XFAIL: * +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck 
-check-prefix=SI %s + +; 3 vectors have the same size and alignment as 4 vectors, so this +; should be done in a single store. + +; SI-LABEL: {{^}}store_v3i32: +; SI: buffer_store_dwordx4 +define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { + store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 + ret void +} diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll new file mode 100644 index 00000000000..e0c554ad2c1 --- /dev/null +++ b/test/CodeGen/AMDGPU/store-v3i64.ll @@ -0,0 +1,29 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}global_store_v3i64: +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}global_store_v3i64_unaligned: +define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}local_store_v3i64: +define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 + ret void +} + +; SI-LABEL: {{^}}local_store_v3i64_unaligned: +define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { + store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/test/CodeGen/AMDGPU/store-vector-ptrs.ll new file mode 100644 index 00000000000..d5af3b29118 --- /dev/null +++ b/test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s + +; This tests for a bug that caused a crash in +; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting +; scratch loads and stores. 
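+; The vector GEP below produces a <4 x i32*> result from a vector of array
+; pointers, which is the kind of address pattern that triggered the crash
+; described above.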
+; CHECK-LABEL: {{^}}store_vector_ptrs: +define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { + %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> + store <4 x i32*> %p, <4 x i32*>* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll new file mode 100644 index 00000000000..0f89405e073 --- /dev/null +++ b/test/CodeGen/AMDGPU/store.ll @@ -0,0 +1,369 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s + +;===------------------------------------------------------------------------===; +; Global Address Space +;===------------------------------------------------------------------------===; +; FUNC-LABEL: {{^}}store_i1: +; EG: MEM_RAT MSKOR +; SI: buffer_store_byte +define void @store_i1(i1 addrspace(1)* %out) { +entry: + store i1 true, i1 addrspace(1)* %out + ret void +} + +; i8 store +; FUNC-LABEL: {{^}}store_i8: +; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X + +; IG 0: Get the byte index and truncate the value +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y +; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) + + +; IG 1: Truncate the calculated the shift amount for the mask + +; IG 2: Shift the value and the mask +; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] +; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-NEXT: 255 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the prevous IGs. +; EG: MOV T[[RW_GPR]].Y, 0.0 +; EG: MOV * T[[RW_GPR]].Z, 0.0 + +; SI: buffer_store_byte + +define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +entry: + store i8 %in, i8 addrspace(1)* %out + ret void +} + +; i16 store +; FUNC-LABEL: {{^}}store_i16: +; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X + +; IG 0: Get the byte index and truncate the value + + +; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x +; EG-NEXT: 3(4.203895e-45), + +; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x +; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y + +; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) +; IG 1: Truncate the calculated the shift amount for the mask + +; IG 2: Shift the value and the mask +; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] +; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] +; EG-NEXT: 65535 +; IG 3: Initialize the Y and Z channels to zero +; XXX: An optimal scheduler should merge this into one of the prevous IGs. 
+; EG: MOV T[[RW_GPR]].Y, 0.0 +; EG: MOV * T[[RW_GPR]].Z, 0.0 + +; SI: buffer_store_short +define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +entry: + store i16 %in, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v2i8: +; EG: MEM_RAT MSKOR +; EG-NOT: MEM_RAT MSKOR + +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i8> + store <2 x i8> %0, <2 x i8> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}store_v2i16: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_short +; SI: buffer_store_short +define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +entry: + %0 = trunc <2 x i32> %in to <2 x i16> + store <2 x i16> %0, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i8: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i8> + store <4 x i8> %0, <4 x i8> addrspace(1)* %out + ret void +} + +; floating-point store +; FUNC-LABEL: {{^}}store_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 + +; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}} + +; SI: buffer_store_dword + +define void @store_f32(float addrspace(1)* %out, float %in) { + store float %in, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i16: +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG: MEM_RAT MSKOR +; EG-NOT: MEM_RAT MSKOR + +; SI: buffer_store_short +; SI: buffer_store_short +; SI: buffer_store_short +; SI: buffer_store_short +; SI-NOT: buffer_store_byte +define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +entry: + %0 = trunc <4 x i32> %in to <4 x i16> + store <4 x i16> %0, <4 x i16> addrspace(1)* %out + ret void +} + +; vec2 floating-point stores +; FUNC-LABEL: {{^}}store_v2f32: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx2 + +define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { +entry: + %0 = insertelement <2 x float> , float %a, i32 0 + %1 = insertelement <2 x float> %0, float %b, i32 1 + store <2 x float> %1, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_v4i32: +; EG: MEM_RAT_CACHELESS STORE_RAW +; EG-NOT: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD +; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx4 +define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i8: +; EG: MEM_RAT MSKOR +; SI: buffer_store_byte +define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_i64_i16: +; EG: MEM_RAT MSKOR +; SI: buffer_store_short +define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(1)* %out + ret void +} + +;===------------------------------------------------------------------------===; +; Local Address Space +;===------------------------------------------------------------------------===; + +; FUNC-LABEL: 
{{^}}store_local_i1: +; EG: LDS_BYTE_WRITE +; SI: ds_write_b8 +define void @store_local_i1(i1 addrspace(3)* %out) { +entry: + store i1 true, i1 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i8: +; EG: LDS_BYTE_WRITE + +; SI: ds_write_b8 +define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { + store i8 %in, i8 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i16: +; EG: LDS_SHORT_WRITE + +; SI: ds_write_b16 +define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { + store i16 %in, i16 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v2i16: +; EG: LDS_WRITE + +; CM: LDS_WRITE + +; SI: ds_write_b16 +; SI: ds_write_b16 +define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { +entry: + store <2 x i16> %in, <2 x i16> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v4i8: +; EG: LDS_WRITE + +; CM: LDS_WRITE + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +entry: + store <4 x i8> %in, <4 x i8> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v2i32: +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write_b64 +define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { +entry: + store <2 x i32> %in, <2 x i32> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_v4i32: +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE +; EG: LDS_WRITE + +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE +; CM: LDS_WRITE + +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +; SI: ds_write_b32 +define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i64_i8: +; EG: LDS_BYTE_WRITE +; SI: ds_write_b8 +define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i8 + store i8 %0, i8 addrspace(3)* %out + ret void +} + +; FUNC-LABEL: {{^}}store_local_i64_i16: +; EG: LDS_SHORT_WRITE +; SI: ds_write_b16 +define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { +entry: + %0 = trunc i64 %in to i16 + store i16 %0, i16 addrspace(3)* %out + ret void +} + +; The stores in this function are combined by the optimizer to create a +; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer +; should not try to split the 64-bit store back into 2 32-bit stores. +; +; Evergreen / Northern Islands don't support 64-bit stores yet, so there should +; be two 32-bit stores. 
+ +; FUNC-LABEL: {{^}}vecload2: +; EG: MEM_RAT_CACHELESS STORE_RAW + +; CM: MEM_RAT_CACHELESS STORE_DWORD + +; SI: buffer_store_dwordx2 +define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +entry: + %0 = load i32, i32 addrspace(2)* %mem, align 4 + %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 + %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 + store i32 %0, i32 addrspace(1)* %out, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; When i128 was a legal type this program generated cannot select errors: + +; FUNC-LABEL: {{^}}"i128-const-store": +; FIXME: We should be able to to this with one store instruction +; EG: STORE_RAW +; EG: STORE_RAW +; EG: STORE_RAW +; EG: STORE_RAW +; CM: STORE_DWORD +; CM: STORE_DWORD +; CM: STORE_DWORD +; CM: STORE_DWORD +; SI: buffer_store_dwordx4 +define void @i128-const-store(i32 addrspace(1)* %out) { +entry: + store i32 1, i32 addrspace(1)* %out, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + store i32 1, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 + store i32 2, i32 addrspace(1)* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 2, i32 addrspace(1)* %arrayidx6, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/store.r600.ll b/test/CodeGen/AMDGPU/store.r600.ll new file mode 100644 index 00000000000..696fb033b5e --- /dev/null +++ b/test/CodeGen/AMDGPU/store.r600.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s + +; XXX: Merge this test into store.ll once it is supported on SI + +; v4i32 store +; EG: {{^}}store_v4i32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 + +define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} + +; v4f32 store +; EG: {{^}}store_v4f32: +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 +define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %1 = load <4 x float>, <4 x float> addrspace(1) * %in + store <4 x float> %1, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll new file mode 100644 index 00000000000..02e592e9a55 --- /dev/null +++ b/test/CodeGen/AMDGPU/structurize.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s +; Test case for a crash in the AMDILCFGStructurizer from a CFG like this: +; +; entry +; / \ +; diamond_head branch_from +; / \ | +; diamond_false diamond_true +; \ / +; done +; +; When the diamond_true branch had more than 100 instructions. +; +; + +; CHECK-LABEL: {{^}}branch_into_diamond: +; === entry block: +; CHECK: ALU_PUSH_BEFORE +; === Branch instruction (IF): +; CHECK: JUMP + ; === branch_from block + ; CHECK: ALU + ; === Duplicated diamond_true block (There can be more than one ALU clause): + ; === XXX: We should be able to optimize this so the basic block is not + ; === duplicated. 
See comments in + ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf() + ; CHECK: ALU +; === Branch instruction (ELSE): +; CHECK: ELSE + ; === diamond_head block: + ; CHECK: ALU_PUSH_BEFORE + ; === Branch instruction (IF): + ; CHECK: JUMP + ; === diamond_true block (There can be more than one ALU clause): + ; ALU + ; === Branch instruction (ELSE): + ; CHECK: ELSE + ; === diamond_false block plus implicit ENDIF + ; CHECK: ALU_POP_AFTER +; === Branch instruction (ENDIF): +; CHECK: POP +; === done block: +; CHECK: ALU +; CHECK: MEM_RAT_CACHELESS +; CHECK: CF_END + + +define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: +%0 = icmp ne i32 %a, 0 + br i1 %0, label %diamond_head, label %branch_from + +diamond_head: + %1 = icmp ne i32 %a, 1 + br i1 %1, label %diamond_true, label %diamond_false + +branch_from: + %2 = add i32 %a, 1 + br label %diamond_true + +diamond_false: + %3 = add i32 %a, 2 + br label %done + +diamond_true: + %4 = phi i32 [%2, %branch_from], [%a, %diamond_head] + ; This block needs to be > 100 ISA instructions to hit the bug, + ; so we'll use udiv instructions. + %div0 = udiv i32 %a, %b + %div1 = udiv i32 %div0, %4 + %div2 = udiv i32 %div1, 11 + %div3 = udiv i32 %div2, %a + %div4 = udiv i32 %div3, %b + %div5 = udiv i32 %div4, %c + %div6 = udiv i32 %div5, %div0 + %div7 = udiv i32 %div6, %div1 + br label %done + +done: + %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll new file mode 100644 index 00000000000..77432c1f9d2 --- /dev/null +++ b/test/CodeGen/AMDGPU/structurize1.ll @@ -0,0 +1,62 @@ +; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s + +; This tests for abug where the AMDILCFGStructurizer was crashing on loops +; like this: +; +; for (i = 0; i < x; i++) { +; if (cond0) { +; if (cond1) { +; +; } else { +; +; } +; if (cond2) { +; +; } +; } +; } + +; CHECK-LABEL: {{^}}if_inside_loop: +; CHECK: LOOP_START_DX10 +; CHECK: END_LOOP +define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + br label %for.body + +for.body: + %0 = phi i32 [0, %entry], [%inc, %for.inc] + %val = phi i32 [0, %entry], [%val.for.inc, %for.inc] + %inc = add i32 %0, 1 + %1 = icmp ult i32 10, %a + br i1 %1, label %for.inc, label %if.then + +if.then: + %2 = icmp ne i32 0, %b + br i1 %2, label %if.then.true, label %if.then.false + +if.then.true: + %3 = add i32 %a, %val + br label %if + +if.then.false: + %4 = mul i32 %a, %val + br label %if + +if: + %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false] + %5 = icmp ne i32 0, %c + br i1 %5, label %if.true, label %for.inc + +if.true: + %6 = add i32 %a, %val.if + br label %for.inc + +for.inc: + %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true] + %7 = icmp ne i32 0, %d + br i1 %7, label %for.body, label %exit + +exit: + store i32 %val.for.inc, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll new file mode 100644 index 00000000000..b7fba0efa5b --- /dev/null +++ b/test/CodeGen/AMDGPU/sub.ll @@ -0,0 +1,130 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +declare i32 @llvm.r600.read.tidig.x() readnone + +; FUNC-LABEL: {{^}}test_sub_i32: +; EG: SUB_INT {{\** 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = sub i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_sub_v2i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = sub <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_sub_v4i32: +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = sub <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}s_sub_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_sub_i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 + +; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] +; EG-DAG: SUB_INT {{[* ]*}}[[LO]] +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT {{[* ]*}}[[HI]] +; EG-NOT: SUB +define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid + %a = load i64, i64 addrspace(1)* %a_ptr + %b = load i64, i64 addrspace(1)* %b_ptr + %result = sub i64 %a, %b + store i64 %result, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}v_test_sub_v2i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> 
addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid + %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = sub <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_test_sub_v4i64: +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +; SI: v_sub_i32_e32 +; SI: v_subb_u32_e32 +define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { + %tid = call i32 @llvm.r600.read.tidig.x() readnone + %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid + %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = sub <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll new file mode 100644 index 00000000000..c4dae4736cf --- /dev/null +++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -0,0 +1,109 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s + +; SI-LABEL:{{^}}row_filter_C1_D0: +; SI: s_endpgm +; Function Attrs: nounwind +define void @row_filter_C1_D0() { +entry: + br i1 undef, label %for.inc.1, label %do.body.preheader + +do.body.preheader: ; preds = %entry + %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 + br i1 undef, label %do.body56.1, label %do.body90 + +do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader + %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] + %2 = insertelement <4 x i32> %1, i32 undef, i32 2 + %3 = insertelement <4 x i32> %2, i32 undef, i32 3 + br i1 undef, label %do.body124.1, label %do.body.1562.preheader + +do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 + %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] + %4 = insertelement <4 x i32> undef, i32 undef, i32 1 + br label %for.inc.1 + +do.body56.1: ; preds = %do.body.preheader + %5 = insertelement <4 x i32> %0, i32 undef, i32 1 + %or.cond472.1 = or i1 undef, undef + br i1 %or.cond472.1, label %do.body56.2, label %do.body90 + +do.body56.2: ; preds = %do.body56.1 + %6 = insertelement <4 x i32> %5, i32 undef, i32 1 + br label %do.body90 + +do.body124.1: ; preds = %do.body90 + %7 = insertelement <4 x i32> %3, i32 undef, i32 3 + br label %do.body.1562.preheader + +for.inc.1: ; preds = %do.body.1562.preheader, %entry + %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ] + %add.i495 = add <4 x i32> %storemerge591, undef + unreachable +} + +; SI-LABEL: {{^}}foo: +; SI: s_endpgm +define void @foo() #0 { +bb: + br i1 undef, label %bb2, label %bb1 + +bb1: ; preds = %bb + br i1 undef, label %bb4, label %bb6 + +bb2: ; preds = %bb4, %bb + %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ] + br i1 undef, label %bb9, label %bb13 + +bb4: ; preds = %bb7, %bb6, %bb1 + %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ] + br label %bb2 + +bb6: ; 
preds = %bb1 + br i1 undef, label %bb7, label %bb4 + +bb7: ; preds = %bb6 + %tmp8 = fmul float undef, undef + br label %bb4 + +bb9: ; preds = %bb2 + %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) + %tmp11 = extractelement <4 x float> %tmp10, i32 1 + %tmp12 = extractelement <4 x float> %tmp10, i32 3 + br label %bb14 + +bb13: ; preds = %bb2 + br i1 undef, label %bb23, label %bb24 + +bb14: ; preds = %bb27, %bb24, %bb9 + %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] + %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] + %tmp17 = fmul float 10.5, %tmp16 + %tmp18 = fmul float 11.5, %tmp15 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) + ret void + +bb23: ; preds = %bb13 + br i1 undef, label %bb24, label %bb26 + +bb24: ; preds = %bb26, %bb23, %bb13 + %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ] + br i1 undef, label %bb27, label %bb14 + +bb26: ; preds = %bb23 + br label %bb24 + +bb27: ; preds = %bb24 + br label %bb14 +} + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.SI.packf16(float, float) #1 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll new file mode 100644 index 00000000000..8bd995a8ecb --- /dev/null +++ b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s +; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges +; properly. + +; Just make sure this test doesn't crash. 
+; CHECK-LABEL: foobar: +; CHECK: s_endpgm +define void @foobar() { + %v0 = icmp eq <4 x i32> undef, + %v3 = sext <4 x i1> %v0 to <4 x i32> + %v4 = extractelement <4 x i32> %v3, i32 1 + %v5 = icmp ne i32 %v4, 0 + %v6 = select i1 %v5, i32 undef, i32 0 + %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 + store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 + ret void +} + +declare double @llvm.fma.f64(double, double, double) diff --git a/test/CodeGen/AMDGPU/swizzle-export.ll b/test/CodeGen/AMDGPU/swizzle-export.ll new file mode 100644 index 00000000000..000ee2faa47 --- /dev/null +++ b/test/CodeGen/AMDGPU/swizzle-export.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s + +;EG: {{^}}main: +;EG: EXPORT T{{[0-9]+}}.XYXX +;EG: EXPORT T{{[0-9]+}}.ZXXX +;EG: EXPORT T{{[0-9]+}}.XXWX +;EG: EXPORT T{{[0-9]+}}.XXXW + +define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = extractelement <4 x float> %reg1, i32 2 + %3 = extractelement <4 x float> %reg1, i32 3 + %4 = load <4 x float>, <4 x float> addrspace(8)* null + %5 = extractelement <4 x float> %4, i32 1 + %6 = load <4 x float>, <4 x float> addrspace(8)* null + %7 = extractelement <4 x float> %6, i32 2 + %8 = load <4 x float>, <4 x float> addrspace(8)* null + %9 = extractelement <4 x float> %8, i32 0 + %10 = fmul float 0.000000e+00, %9 + %11 = load <4 x float>, <4 x float> addrspace(8)* null + %12 = extractelement <4 x float> %11, i32 0 + %13 = fmul float %5, %12 + %14 = load <4 x float>, <4 x float> addrspace(8)* null + %15 = extractelement <4 x float> %14, i32 0 + %16 = fmul float 0.000000e+00, %15 + %17 = load <4 x float>, <4 x float> addrspace(8)* null + %18 = extractelement <4 x float> %17, i32 0 + %19 = fmul float 0.000000e+00, %18 + %20 = load <4 x float>, <4 x float> addrspace(8)* null + %21 = extractelement <4 x float> %20, i32 0 + %22 = fmul float %7, %21 + %23 = load <4 x float>, <4 x float> addrspace(8)* null + %24 = extractelement <4 x float> %23, i32 0 + %25 = fmul float 0.000000e+00, %24 + %26 = load <4 x float>, <4 x float> addrspace(8)* null + %27 = extractelement <4 x float> %26, i32 0 + %28 = fmul float 0.000000e+00, %27 + %29 = load <4 x float>, <4 x float> addrspace(8)* null + %30 = extractelement <4 x float> %29, i32 0 + %31 = fmul float 0.000000e+00, %30 + %32 = load <4 x float>, <4 x float> addrspace(8)* null + %33 = extractelement <4 x float> %32, i32 0 + %34 = fmul float 0.000000e+00, %33 + %35 = load <4 x float>, <4 x float> addrspace(8)* null + %36 = extractelement <4 x float> %35, i32 0 + %37 = fmul float 0.000000e+00, %36 + %38 = load <4 x float>, <4 x float> addrspace(8)* null + %39 = extractelement <4 x float> %38, i32 0 + %40 = fmul float 1.000000e+00, %39 + %41 = load <4 x float>, <4 x float> addrspace(8)* null + %42 = extractelement <4 x float> %41, i32 0 + %43 = fmul float 0.000000e+00, %42 + %44 = load <4 x float>, <4 x float> addrspace(8)* null + %45 = extractelement <4 x float> %44, i32 0 + %46 = fmul float 0.000000e+00, %45 + %47 = load <4 x float>, <4 x float> addrspace(8)* null + %48 = extractelement <4 x float> %47, i32 0 + %49 = fmul float 0.000000e+00, %48 + %50 = load <4 x float>, <4 x float> addrspace(8)* null + %51 = extractelement <4 x float> %50, i32 0 + %52 = fmul float 0.000000e+00, %51 + %53 = load <4 x float>, <4 x float> addrspace(8)* null + %54 = extractelement <4 x float> %53, i32 0 + %55 = fmul float 
1.000000e+00, %54 + %56 = insertelement <4 x float> undef, float %0, i32 0 + %57 = insertelement <4 x float> %56, float %1, i32 1 + %58 = insertelement <4 x float> %57, float %2, i32 2 + %59 = insertelement <4 x float> %58, float %3, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1) + %60 = insertelement <4 x float> undef, float %10, i32 0 + %61 = insertelement <4 x float> %60, float %13, i32 1 + %62 = insertelement <4 x float> %61, float %16, i32 2 + %63 = insertelement <4 x float> %62, float %19, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2) + %64 = insertelement <4 x float> undef, float %22, i32 0 + %65 = insertelement <4 x float> %64, float %25, i32 1 + %66 = insertelement <4 x float> %65, float %28, i32 2 + %67 = insertelement <4 x float> %66, float %31, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2) + %68 = insertelement <4 x float> undef, float %34, i32 0 + %69 = insertelement <4 x float> %68, float %37, i32 1 + %70 = insertelement <4 x float> %69, float %40, i32 2 + %71 = insertelement <4 x float> %70, float %43, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2) + %72 = insertelement <4 x float> undef, float %46, i32 0 + %73 = insertelement <4 x float> %72, float %49, i32 1 + %74 = insertelement <4 x float> %73, float %52, i32 2 + %75 = insertelement <4 x float> %74, float %55, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2) + ret void +} + +; EG: {{^}}main2: +; EG: T{{[0-9]+}}.XY__ +; EG: T{{[0-9]+}}.ZXY0 + +define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +main_body: + %0 = extractelement <4 x float> %reg1, i32 0 + %1 = extractelement <4 x float> %reg1, i32 1 + %2 = fadd float %0, 2.5 + %3 = fmul float %1, 3.5 + %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) + %5 = extractelement <4 x float> %4, i32 0 + %6 = call float @llvm.cos.f32(float %5) + %7 = load <4 x float>, <4 x float> addrspace(8)* null + %8 = extractelement <4 x float> %7, i32 0 + %9 = load <4 x float>, <4 x float> addrspace(8)* null + %10 = extractelement <4 x float> %9, i32 1 + %11 = insertelement <4 x float> undef, float %2, i32 0 + %12 = insertelement <4 x float> %11, float %3, i32 1 + call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1) + %13 = insertelement <4 x float> undef, float %6, i32 0 + %14 = insertelement <4 x float> %13, float %8, i32 1 + %15 = insertelement <4 x float> %14, float %10, i32 2 + %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2) + ret void +} + +; Function Attrs: nounwind readonly +declare float @llvm.cos.f32(float) #1 + +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/tex-clause-antidep.ll b/test/CodeGen/AMDGPU/tex-clause-antidep.ll new file mode 100644 index 00000000000..cbb9c50974a --- /dev/null +++ b/test/CodeGen/AMDGPU/tex-clause-antidep.ll @@ -0,0 +1,25 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX +;CHECK-NEXT: ALU + +define void @test(<4 x float> inreg %reg0) #0 { + %1 = extractelement <4 x float> %reg0, i32 0 + %2 = extractelement <4 x float> %reg0, i32 1 + %3 = extractelement <4 x float> %reg0, i32 2 + %4 = extractelement <4 x float> %reg0, i32 3 + %5 = insertelement <4 x float> undef, float 
%1, i32 0 + %6 = insertelement <4 x float> %5, float %2, i32 1 + %7 = insertelement <4 x float> %6, float %3, i32 2 + %8 = insertelement <4 x float> %7, float %4, i32 3 + %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %11 = fadd <4 x float> %9, %10 + call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0) + ret void +} + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/texture-input-merge.ll b/test/CodeGen/AMDGPU/texture-input-merge.ll new file mode 100644 index 00000000000..789538af582 --- /dev/null +++ b/test/CodeGen/AMDGPU/texture-input-merge.ll @@ -0,0 +1,31 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: MOV + +define void @test(<4 x float> inreg %reg0) #0 { + %1 = extractelement <4 x float> %reg0, i32 0 + %2 = extractelement <4 x float> %reg0, i32 1 + %3 = extractelement <4 x float> %reg0, i32 2 + %4 = extractelement <4 x float> %reg0, i32 3 + %5 = fmul float %1, 3.0 + %6 = fmul float %2, 3.0 + %7 = fmul float %3, 3.0 + %8 = fmul float %4, 3.0 + %9 = insertelement <4 x float> undef, float %5, i32 0 + %10 = insertelement <4 x float> %9, float %6, i32 1 + %11 = insertelement <4 x float> undef, float %7, i32 0 + %12 = insertelement <4 x float> %11, float %5, i32 1 + %13 = insertelement <4 x float> undef, float %8, i32 0 + %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %17 = fadd <4 x float> %14, %15 + %18 = fadd <4 x float> %17, %16 + call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0) + ret void +} + +declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) + +attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll new file mode 100644 index 00000000000..dac74728b3c --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -0,0 +1,170 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}} +; SI: v_cndmask_b32_e64 +; SI: buffer_store_byte +define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FIXME: The negate should be inverting the compare. 
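+; That is, instead of computing (x == 1) and then flipping the mask with
+; s_xor_b64, a single inverted compare (e.g. v_cmp_ne_i32_e32 vcc, 1, [[TMP]])
+; could feed the v_cndmask_b32 directly.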
+; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp eq i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp eq i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + + +; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, 0 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, 1 
+ store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1: +; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} +; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] +; SI-NEXT: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, 1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FIXME: This should be one compare. +; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1: +; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]] +; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] +; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} +; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] +; XSI-NEXT: buffer_store_byte [[RESULT]] +define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = sext i1 %load to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: +; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} +; SI: buffer_store_byte [[RESULT]] +define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { + %load = load i1, i1 addrspace(1)* %in + %ext = zext i1 %load to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1: +; SI: buffer_load_sbyte [[LOAD:v[0-9]+]] +; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}} +; SI-NEXT: v_cndmask_b32_e64 +; SI-NEXT: buffer_store_byte +define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { + %load = load i8, i8 addrspace(1)* %in + %masked = and i8 %load, 255 + %ext = sext i8 %masked to i32 + %cmp = icmp ne i32 %ext, -1 + store i1 %cmp, i1 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll new file mode 100644 index 00000000000..c29872beef8 --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -0,0 +1,56 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI < %s + +; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: +; GCN: s_endpgm +define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { + %val = load double, double addrspace(1)* %in + %cvt = fptrunc double %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: +; GCN: s_endpgm +define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { + %val = load <2 x double>, <2 x double> addrspace(1)* %in + %cvt = fptrunc <2 x double> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: +; GCN: s_endpgm +define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { + %val = load <3 x double>, <3 x double> addrspace(1)* %in + %cvt = fptrunc <3 x double> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}global_truncstore_v4f64_to_v4f16: +; GCN: s_endpgm +define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { + %val = load <4 x double>, <4 x double> addrspace(1)* %in + %cvt = fptrunc <4 x double> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: +; GCN: s_endpgm +define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { + %val = load <8 x double>, <8 x double> addrspace(1)* %in + %cvt = fptrunc <8 x double> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: +; GCN: s_endpgm +define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { + %val = load <16 x double>, <16 x double> addrspace(1)* %in + %cvt = fptrunc <16 x double> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll new file mode 100644 index 00000000000..b71a838b62c --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -0,0 +1,33 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + + +; SI-LABEL: {{^}}global_truncstore_i32_to_i1: +; SI: s_load_dword [[LOAD:s[0-9]+]], +; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; SI: buffer_store_byte [[VREG]], +define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { + %trunc = trunc i32 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}global_truncstore_i64_to_i1: +; SI: buffer_store_byte +define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { + %trunc = trunc i64 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} + +; SI-LABEL: {{^}}global_truncstore_i16_to_i1: +; SI: s_load_dword [[LOAD:s[0-9]+]], +; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 +; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] +; SI: buffer_store_byte [[VREG]], +define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { + %trunc = trunc i16 %val to i1 + store i1 %trunc, i1 addrspace(1)* %out, align 1 + ret void +} diff --git a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll new file mode 100644 index 00000000000..878ea3f4899 --- /dev/null +++ b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This tests for a bug in the SelectionDAG where custom lowered truncated +; vector stores at the end of a basic block were not being added to the +; LegalizedNodes list, which triggered an assertion failure. 
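+; The <4 x i8> store in the %if block below is the case in question: it needs
+; custom lowering, and only the branch follows it in its basic block.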
+
+; CHECK-LABEL: {{^}}test:
+; CHECK: MEM_RAT_CACHELESS STORE_RAW
+define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+entry:
+  %0 = icmp eq i32 %cond, 0
+  br i1 %0, label %if, label %done
+
+if:
+  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  br label %done
+
+done:
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
new file mode 100644
index 00000000000..bf690ca4cb2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+; SI-LABEL: {{^}}trunc_i64_to_i32_store:
+; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
+; SI: buffer_store_dword [[VLOAD]]
+
+; EG-LABEL: {{^}}trunc_i64_to_i32_store:
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG: LSHR
+; EG-NEXT: 2(
+
+  %result = trunc i64 %in to i32
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}trunc_load_shl_i64:
+; SI-DAG: s_load_dwordx2
+; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: buffer_store_dword [[VSHL]],
+define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+  %b = shl i64 %a, 2
+  %result = trunc i64 %b to i32
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}trunc_shl_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
+; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; SI: s_addc_u32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32
+; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: buffer_store_dword v[[LO_VREG]],
+define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+  %aa = add i64 %a, 234 ; Prevent shrinking store.
+ %b = shl i64 %aa, 2 + %result = trunc i64 %b to i32 + store i32 %result, i32 addrspace(1)* %out, align 4 + store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits + ret void +} + +; SI-LABEL: {{^}}trunc_i32_to_i1: +; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; SI: v_cmp_eq_i32 +define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { + %a = load i32, i32 addrspace(1)* %ptr, align 4 + %trunc = trunc i32 %a to i1 + %result = select i1 %trunc, i32 1, i32 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: +; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} +; SI: v_cmp_eq_i32 +define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { + %trunc = trunc i32 %a to i1 + %result = select i1 %trunc, i32 1, i32 0 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}s_trunc_i64_to_i1: +; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { + %trunc = trunc i64 %x to i1 + %sel = select i1 %trunc, i32 63, i32 -12 + store i32 %sel, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}v_trunc_i64_to_i1: +; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} +; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] +; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] +; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc +define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %x = load i64, i64 addrspace(1)* %gep + + %trunc = trunc i64 %x to i1 + %sel = select i1 %trunc, i32 63, i32 -12 + store i32 %sel, i32 addrspace(1)* %out.gep + ret void +} diff --git a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll new file mode 100644 index 00000000000..76c32afc1f2 --- /dev/null +++ b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -0,0 +1,58 @@ +; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s + +; This IR comes from this OpenCL C code: +; +; if (b + 4 > a) { +; for (int i = 0; i < 4; i++, b++) { +; if (b + 1 <= a) +; *(dst + c + b) = 0; +; else +; break; +; } +; } +; +; This test is meant to check that this loop isn't unrolled into more than +; four iterations. The loop unrolling preferences we currently use cause this +; loop to not be unrolled at all, but that may change in the future. 
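+; The CHECK / CHECK-NOT pair below therefore accepts exactly one copy of the
+; store, matching the current "not unrolled at all" behavior; it would need
+; updating if partial unrolling (up to four iterations) were ever enabled.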
+ +; CHECK-LABEL: @test +; CHECK: store i8 0, i8 addrspace(1)* +; CHECK-NOT: store i8 0, i8 addrspace(1)* +; CHECK: ret void +define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { +entry: + %add = add nsw i32 %b, 4 + %cmp = icmp sgt i32 %add, %a + br i1 %cmp, label %for.cond.preheader, label %if.end7 + +for.cond.preheader: ; preds = %entry + %cmp313 = icmp slt i32 %b, %a + br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit + +if.then4.lr.ph: ; preds = %for.cond.preheader + %0 = sext i32 %c to i64 + br label %if.then4 + +if.then4: ; preds = %if.then4.lr.ph, %if.then4 + %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ] + %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ] + %add2 = add nsw i32 %b.addr.014, 1 + %1 = sext i32 %b.addr.014 to i64 + %add.ptr.sum = add nsw i64 %1, %0 + %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum + store i8 0, i8 addrspace(1)* %add.ptr5, align 1 + %inc = add nsw i32 %i.015, 1 + %cmp1 = icmp slt i32 %inc, 4 + %cmp3 = icmp slt i32 %add2, %a + %or.cond = and i1 %cmp3, %cmp1 + br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge + +for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4 + br label %if.end7.loopexit + +if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader + br label %if.end7 + +if.end7: ; preds = %if.end7.loopexit, %entry + ret void +} diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll new file mode 100644 index 00000000000..11438f267ad --- /dev/null +++ b/test/CodeGen/AMDGPU/uaddo.ll @@ -0,0 +1,85 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}uaddo_i64_zext: +; SI: add +; SI: addc +; SI: addc + +; EG: ADDC_UINT +; EG: ADDC_UINT +define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_uaddo_i32: +; SI: s_add_i32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i32: +; SI: v_add_i32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %uadd, 0 + %carry = 
extractvalue { i32, i1 } %uadd, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_uaddo_i64: +; SI: s_add_u32 +; SI: s_addc_u32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_uaddo_i64: +; SI: v_add_i32 +; SI: v_addc_u32 + +; EG: ADDC_UINT +; EG: ADD_INT +define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %uadd, 0 + %carry = extractvalue { i64, i1 } %uadd, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll new file mode 100644 index 00000000000..de22a22e502 --- /dev/null +++ b/test/CodeGen/AMDGPU/udiv.ll @@ -0,0 +1,48 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +;EG-LABEL: {{^}}test: +;EG-NOT: SETGE_INT +;EG: CF_END + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1) * %in + %b = load i32, i32 addrspace(1) * %b_ptr + %result = udiv i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +;The code generated by udiv is long and complex and may frequently change. 
+;The goal of this test is to make sure the ISel doesn't fail when it gets +;a v4i32 udiv + +;EG-LABEL: {{^}}test2: +;EG: CF_END +;SI-LABEL: {{^}}test2: +;SI: s_endpgm + +define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in + %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr + %result = udiv <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG-LABEL: {{^}}test4: +;EG: CF_END +;SI-LABEL: {{^}}test4: +;SI: s_endpgm + +define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in + %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr + %result = udiv <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll new file mode 100644 index 00000000000..b3837f28209 --- /dev/null +++ b/test/CodeGen/AMDGPU/udivrem.ll @@ -0,0 +1,345 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_udivrem: +; EG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG: CNDE_INT +; EG: MULHI +; EG: MULLO_INT +; EG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] +; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] +; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] +; SI: v_cndmask_b32_e64 +; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] +; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] +; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] +; SI: v_cndmask_b32_e64 +; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] +; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] +; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { + %result0 = udiv i32 %x, %y + store i32 %result0, i32 addrspace(1)* %out + %result1 = urem i32 %x, %y + store i32 %result1, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_udivrem_v2: +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: 
SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { + %result0 = udiv <2 x i32> %x, %y + store <2 x i32> %result0, <2 x i32> addrspace(1)* %out + %result1 = urem <2 x i32> %x, %y + store <2 x i32> %result1, <2 x i32> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}test_udivrem_v4: +; EG-DAG: RECIP_UINT +; EG-DAG: 
MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: RECIP_UINT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: MULHI +; EG-DAG: MULLO_INT +; EG-DAG: SUB_INT +; EG-DAG: SETGE_UINT +; EG-DAG: SETGE_UINT +; EG-DAG: AND_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT +; EG-DAG: ADD_INT +; EG-DAG: SUB_INT +; EG-DAG: CNDE_INT +; EG-DAG: CNDE_INT + +; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] +; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] +; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] +; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] +; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] +; 
SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] +; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] +; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] +; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] +; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] +; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], +; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] +; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] +; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] +; SI-DAG: v_cndmask_b32_e64 +; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] +; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] +; SI-DAG: v_cndmask_b32_e64 +; SI: s_endpgm +define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { + %result0 = udiv <4 x i32> %x, %y + store <4 x i32> %result0, <4 x i32> addrspace(1)* %out + %result1 = urem <4 x i32> %x, %y + store <4 x i32> %result1, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll new file mode 100644 index 00000000000..4de881b66f1 --- /dev/null +++ b/test/CodeGen/AMDGPU/udivrem24.ll @@ -0,0 +1,245 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}udiv24_i8: +; SI: v_cvt_f32_ubyte +; SI: v_cvt_f32_ubyte +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void 
@udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = udiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i16: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = udiv i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i32: +; SI: v_cvt_f32_u32 +; SI-DAG: v_cvt_f32_u32 +; SI-DAG: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}udiv25_i32: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = udiv i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}urem24_i8: +; SI: 
v_cvt_f32_ubyte +; SI: v_cvt_f32_ubyte +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = urem i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}urem24_i16: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { + %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 + %num = load i16, i16 addrspace(1) * %in, align 2 + %den = load i16, i16 addrspace(1) * %den_ptr, align 2 + %result = urem i16 %num, %den + store i16 %result, i16 addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}urem24_i32: +; SI: v_cvt_f32_u32 +; SI: v_cvt_f32_u32 +; SI: v_rcp_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 8 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}urem25_i32: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_urem24_i32_1: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 8 + %den.i24.0 = shl i32 %den, 7 + %num.i24 = lshr i32 %num.i24.0, 8 + %den.i24 = lshr i32 %den.i24.0, 7 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_no_urem24_i32_2: +; RCP_IFLAG is for URECIP in the full 32b alg +; SI: v_rcp_iflag +; SI-NOT: v_rcp_f32 + +; EG-NOT: UINT_TO_FLT +; EG-NOT: RECIP_IEEE +define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %num = load i32, i32 addrspace(1) * %in, align 4 + %den = load i32, i32 addrspace(1) * %den_ptr, align 4 + %num.i24.0 = shl i32 %num, 7 + %den.i24.0 = shl i32 %den, 8 + %num.i24 = lshr i32 %num.i24.0, 7 + %den.i24 = lshr 
i32 %den.i24.0, 8 + %result = urem i32 %num.i24, %den.i24 + store i32 %result, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll new file mode 100644 index 00000000000..9f3069bdf80 --- /dev/null +++ b/test/CodeGen/AMDGPU/udivrem64.ll @@ -0,0 +1,223 @@ +;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_udiv: +;EG: RECIP_UINT +;EG: LSHL {{.*}}, 1, +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = udiv i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem: +;EG: RECIP_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: BFE_UINT +;EG: AND_INT {{.*}}, 1, + +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %result = urem i64 %x, %y + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = udiv i64 %1, %2 + 
store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem3264: +;EG: RECIP_UINT +;EG-NOT: BFE_UINT + +;GCN-NOT: s_bfe_u32 +;GCN-NOT: v_mad_f32 +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: s_endpgm +define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 33 + %2 = lshr i64 %y, 33 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_udiv2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: v_mad_f32 +;GCN: s_endpgm +define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = udiv i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +;FUNC-LABEL: {{^}}test_urem2464: +;EG: UINT_TO_FLT +;EG: UINT_TO_FLT +;EG: FLT_TO_UINT +;EG-NOT: RECIP_UINT +;EG-NOT: BFE_UINT + +;SI-NOT: v_lshr_b64 +;VI-NOT: v_lshrrev_b64 +;GCN: v_mad_f32 +;GCN: s_endpgm +define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { + %1 = lshr i64 %x, 40 + %2 = lshr i64 %y, 40 + %result = urem i64 %1, %2 + store i64 %result, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll new file mode 100644 index 00000000000..dfec8eb15cb --- /dev/null +++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 +; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] +; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 +; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] +; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] +; SI: buffer_store_dwordx2 [[RESULT]] +define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %val = load i64, i64 addrspace(1)* %gep, align 8 + %result = uitofp i64 %val to double + store double %result, double addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 +define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { + %cast = uitofp i64 %in to double + store double %cast, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 +define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { + %cast = uitofp <2 x i64> %in to <2 x double> + store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 +define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { + %cast = uitofp <4 x i64> %in to <4 x double> + store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { + %cast = uitofp i32 %in to double + store double %cast, double addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: 
s_endpgm +define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { + %cast = uitofp <2 x i32> %in to <2 x double> + store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 + ret void +} + +; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: v_cvt_f64_u32_e32 +; SI: s_endpgm +define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { + %cast = uitofp <4 x i32> %in to <4 x double> + store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 + ret void +} + +; FIXME: select on 0, 0 +; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already +; uses an SGPR for [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to double + store double %fp, double addrspace(1)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load: +; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1 +; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] +; SI: buffer_store_dwordx2 [[RESULT]] +; SI: s_endpgm +define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { + %fp = uitofp i1 %in to double + store double %fp, double addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll new file mode 100644 index 00000000000..00fea80b1bc --- /dev/null +++ b/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -0,0 +1,82 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32: +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z + +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { + %result = uitofp i32 %in to float + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32: +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W +; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { + %result = uitofp <2 x i32> %in to <2 x float> + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32: +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: s_endpgm +define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %value = load <4 x i32>, <4 x i32> addrspace(1) * %in + %result = uitofp <4 x i32> %value to <4 x float> + store <4 x float> %result, <4 x float> 
addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32: +; R600: UINT_TO_FLT +; R600: UINT_TO_FLT +; R600: MULADD_IEEE +; SI: v_cvt_f32_u32_e32 +; SI: v_cvt_f32_u32_e32 +; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 +; SI: s_endpgm +define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) { +entry: + %0 = uitofp i64 %in to float + store float %0, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32: +; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) { + %cmp = icmp eq i32 %in, 0 + %fp = uitofp i1 %cmp to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load: +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) { + %fp = uitofp i1 %in to float + store float %fp, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll new file mode 100644 index 00000000000..82d88ebd3ae --- /dev/null +++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -0,0 +1,254 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}unaligned_load_store_i16_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { + %v = load i16, i16 addrspace(3)* %p, align 1 + store i16 %v, i16 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i16_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { + %v = load i16, i16 addrspace(1)* %p, align 1 + store i16 %v, i16 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { + %v = load i32, i32 addrspace(3)* %p, align 1 + store i32 %v, i32 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i32_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { + %v = load i32, i32 addrspace(1)* %p, align 1 + store i32 %v, i32 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i64_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: 
ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { + %v = load i64, i64 addrspace(3)* %p, align 1 + store i64 %v, i64 addrspace(3)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_i64_global: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { + %v = load i64, i64 addrspace(1)* %p, align 1 + store i64 %v, i64 addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}unaligned_load_store_v4i32_local: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 + +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: ds_write_b8 +; SI: s_endpgm +define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { + %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 + store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 + ret void +} + +; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded. 
+; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +; FIXME-SI: buffer_load_ubyte +define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind { + %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 + store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_4: +; SI: ds_read2_b32 +; SI: s_endpgm +define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %val = load i64, i64 addrspace(3)* %in, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 +; SI: s_endpgm +define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 + %val = load i64, i64 addrspace(3)* %ptr, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset: +; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 +; SI: s_endpgm +define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* + %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 + %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* + %val = load i64, i64 addrspace(3)* %ptri64, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}load_lds_i64_align_1: +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: ds_read_u8 +; SI: buffer_store_dwordx2 +; SI: s_endpgm + +define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %val = load i64, i64 addrspace(3)* %in, align 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: {{^}}store_lds_i64_align_4: +; SI: ds_write2_b32 +; SI: s_endpgm +define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { + store i64 %val, i64 addrspace(3)* %out, align 4 + ret void +} + +; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset +; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 +; SI: s_endpgm +define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { + %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 + store i64 0, i64 addrspace(3)* %ptr, align 4 + ret void +} + +; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset: +; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits +; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; SI: s_endpgm +define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { + %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* + 
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 + %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* + store i64 0, i64 addrspace(3)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll new file mode 100644 index 00000000000..036a7e91b47 --- /dev/null +++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll @@ -0,0 +1,115 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s +; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s + +; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. + +; COMMON-LABEL: {{^}}branch_true: +define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 true, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; COMMON-LABEL: {{^}}branch_false: +; SI: .text +; SI-NEXT: s_endpgm +define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 false, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 
%add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; COMMON-LABEL: {{^}}branch_undef: +; SI: .text +; SI-NEXT: s_endpgm +define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +entry: + br i1 undef, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %add.ptr.sum = shl i32 %main_stride, 1 + %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride + %add.ptr4.sum = shl i32 %main_stride, 2 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] + %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* + %1 = load i32, i32 addrspace(1)* %0, align 4 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride + %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* + %3 = load i32, i32 addrspace(1)* %2, align 4 + %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum + %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* + %5 = load i32, i32 addrspace(1)* %4, align 4 + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum + %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* + %7 = load i32, i32 addrspace(1)* %6, align 4 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum + %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* + %9 = load i32, i32 addrspace(1)* %8, align 4 + %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/unroll.ll b/test/CodeGen/AMDGPU/unroll.ll new file mode 100644 index 00000000000..411a15a4b83 --- /dev/null +++ b/test/CodeGen/AMDGPU/unroll.ll @@ -0,0 +1,36 @@ +; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s +; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s + + +; This test contains a simple loop that initializes an array declared in +; private memory. We want to make sure these kinds of loops are always +; unrolled, because private memory is slow. 
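+; Once the loop is fully unrolled all GEP indices become constants, so -sroa can
+; promote the private array and the load of element 5 folds to the constant 5
+; that the CHECK line below expects.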
+ +; CHECK-LABEL: @test +; CHECK-NOT: alloca +; CHECK: store i32 5, i32 addrspace(1)* %out +define void @test(i32 addrspace(1)* %out) { +entry: + %0 = alloca [32 x i32] + br label %loop.header + +loop.header: + %counter = phi i32 [0, %entry], [%inc, %loop.inc] + br label %loop.body + +loop.body: + %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter + store i32 %counter, i32* %ptr + br label %loop.inc + +loop.inc: + %inc = add i32 %counter, 1 + %1 = icmp sge i32 %counter, 32 + br i1 %1, label %exit, label %loop.header + +exit: + %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5 + %3 = load i32, i32* %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll new file mode 100644 index 00000000000..8ab4faf2f14 --- /dev/null +++ b/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -0,0 +1,125 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests are for condition codes that are not supported by the hardware + +; CHECK-LABEL: {{^}}slt: +; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5(7.006492e-45) +define void @slt(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp slt i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_i32: +; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 5(7.006492e-45) +define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ult i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_float: +; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x +; CHECK-NEXT: 1084227584(5.000000e+00) +; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 +; CHECK-NEXT: LSHR * +define void @ult_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ult_float_native: +; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @ult_float_native(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 0.0, float 1.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}olt: +; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @olt(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}sle: +; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 6(8.407791e-45) +define void @sle(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sle i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_i32: +; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR +; CHECK-NEXT: 6(8.407791e-45) +define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ule i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_float: +; CHECK: SETGT * 
T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x +; CHECK-NEXT: 1084227584(5.000000e+00) +; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 +; CHECK-NEXT: LSHR * +define void @ule_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ule_float_native: +; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x +; CHECK-NEXT: LSHR * +; CHECK-NEXT: 1084227584(5.000000e+00) +define void @ule_float_native(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 0.0, float 1.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: {{^}}ole: +; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z +; CHECK-NEXT: LSHR * +; CHECK-NEXT:1084227584(5.000000e+00) +define void @ole(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll new file mode 100644 index 00000000000..daacc771708 --- /dev/null +++ b/test/CodeGen/AMDGPU/urecip.ll @@ -0,0 +1,13 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK: v_rcp_iflag_f32_e32 + +define void @test(i32 %p, i32 %q) { + %i = udiv i32 %p, %q + %r = bitcast i32 %i to float + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) + ret void +} + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll new file mode 100644 index 00000000000..62841ec2d6c --- /dev/null +++ b/test/CodeGen/AMDGPU/urem.ll @@ -0,0 +1,94 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; The code generated by urem is long and complex and may frequently +; change. 
The goal of this test is to make sure the ISel doesn't fail +; when it gets a v2i32/v4i32 urem + +; FUNC-LABEL: {{^}}test_urem_i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 + %a = load i32, i32 addrspace(1)* %in + %b = load i32, i32 addrspace(1)* %b_ptr + %result = urem i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_i32_7: +; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 +; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] +; SI: v_subrev_i32 +; SI: v_mul_lo_i32 +; SI: v_sub_i32 +; SI: buffer_store_dword +; SI: s_endpgm +define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %num = load i32, i32 addrspace(1) * %in + %result = urem i32 %num, 7 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v2i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 + %a = load <2 x i32>, <2 x i32> addrspace(1)* %in + %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr + %result = urem <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v4i32: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32>, <4 x i32> addrspace(1)* %in + %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr + %result = urem <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 + %a = load i64, i64 addrspace(1)* %in + %b = load i64, i64 addrspace(1)* %b_ptr + %result = urem i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v2i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 + %a = load <2 x i64>, <2 x i64> addrspace(1)* %in + %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr + %result = urem <2 x i64> %a, %b + store <2 x i64> %result, <2 x i64> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_urem_v4i64: +; SI: s_endpgm +; EG: CF_END +define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 + %a = load <4 x i64>, <4 x i64> addrspace(1)* %in + %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr + %result = urem <4 x i64> %a, %b + store <4 x i64> %result, <4 x i64> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll new file mode 100644 index 00000000000..f26f30022b4 --- /dev/null +++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -0,0 +1,103 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +declare float @llvm.fma.f32(float, float, 
float) #1 +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 + + +; GCN-LABEL: {{^}}test_sgpr_use_twice_binop: +; GCN: s_load_dword [[SGPR:s[0-9]+]], +; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { + %dbl = fadd float %a, %a + store float %dbl, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op: +; GCN: s_load_dword [[SGPR:s[0-9]+]], +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: +; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc +; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { + %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}test_sgpr_use_twice_ternary_op_a_imm_a: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { + %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 + store float %fma, float addrspace(1)* %out, align 4 + ret void +} + +; Don't use fma since fma c, x, y is canonicalized to fma x, c, y +; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a: +; GCN: s_load_dword [[SGPR:s[0-9]+]] +; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]] +; GCN: buffer_store_dword [[RESULT]] +define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 { + %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1 + store i32 %fma, i32 addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll new file mode 100644 index 00000000000..3c9b1622a07 --- /dev/null +++ b/test/CodeGen/AMDGPU/usubo.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone +declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone + +; FUNC-LABEL: {{^}}usubo_i64_zext: + +; EG: SUBB_UINT +; EG: ADDC_UINT +define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + %ext = zext i1 %carry to i64 + %add2 = add i64 %val, %ext + store i64 %add2, i64 addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: {{^}}s_usubo_i32: +; SI: s_sub_i32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_usubo_i32: +; SI: v_subrev_i32_e32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { + %a = load i32, i32 addrspace(1)* %aptr, align 4 + %b = load i32, i32 addrspace(1)* %bptr, align 4 + %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind + %val = extractvalue { i32, i1 } %usub, 0 + %carry = extractvalue { i32, i1 } %usub, 1 + store i32 %val, i32 addrspace(1)* %out, align 4 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}s_usubo_i64: +; SI: s_sub_u32 +; SI: s_subb_u32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG: SUB_INT +define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = 
extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} + +; FUNC-LABEL: {{^}}v_usubo_i64: +; SI: v_sub_i32 +; SI: v_subb_u32 + +; EG-DAG: SUBB_UINT +; EG-DAG: SUB_INT +; EG-DAG: SUB_INT +; EG: SUB_INT +define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { + %a = load i64, i64 addrspace(1)* %aptr, align 4 + %b = load i64, i64 addrspace(1)* %bptr, align 4 + %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind + %val = extractvalue { i64, i1 } %usub, 0 + %carry = extractvalue { i64, i1 } %usub, 1 + store i64 %val, i64 addrspace(1)* %out, align 8 + store i1 %carry, i1 addrspace(1)* %carryout + ret void +} diff --git a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll new file mode 100644 index 00000000000..31755125c03 --- /dev/null +++ b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll @@ -0,0 +1,17 @@ +; REQUIRES: asserts +; XFAIL: * +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s + +; CHECK-LABEL: {{^}}kernel_arg_i64: +define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { + store i64 %a, i64 addrspace(1)* %out, align 8 + ret void +} + +; i64 arg works, v1i64 arg does not. +; CHECK-LABEL: {{^}}kernel_arg_v1i64: +define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { + store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 + ret void +} + diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll new file mode 100644 index 00000000000..c368c5aaf7d --- /dev/null +++ b/test/CodeGen/AMDGPU/v_cndmask.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() #1 + +; SI-LABEL: {{^}}v_cnd_nan_nosgpr: +; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: v{{[0-9]}} +; All nan values are converted to 0xffffffff +; SI: s_endpgm +define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { + %idx = call i32 @llvm.r600.read.tidig.x() #1 + %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx + %f = load float, float addrspace(1)* %fptr + %setcc = icmp ne i32 %c, 0 + %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f + store float %select, float addrspace(1)* %out + ret void +} + + +; This requires slightly trickier SGPR operand legalization since the +; single constant bus SGPR usage is the last operand, and it should +; never be moved. 
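+; Unlike v_cnd_nan_nosgpr above, %f here is a scalar kernel argument rather than
+; a loaded value, so it starts out in an SGPR instead of a VGPR.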
+ +; SI-LABEL: {{^}}v_cnd_nan: +; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} +; SI-DAG: v{{[0-9]}} +; All nan values are converted to 0xffffffff +; SI: s_endpgm +define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { + %setcc = icmp ne i32 %c, 0 + %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f + store float %select, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll new file mode 100644 index 00000000000..7d0ebd139f5 --- /dev/null +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -0,0 +1,188 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; SI-LABEL: @test_if +; Make sure the i1 values created by the cfg structurizer pass are +; moved using VALU instructions +; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 +; SI: v_mov_b32_e32 v{{[0-9]}}, -1 +define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +entry: + switch i32 %a, label %default [ + i32 0, label %case0 + i32 1, label %case1 + ] + +case0: + %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + store i32 0, i32 addrspace(1)* %arrayidx1, align 4 + br label %end + +case1: + %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + store i32 1, i32 addrspace(1)* %arrayidx5, align 4 + br label %end + +default: + %cmp8 = icmp eq i32 %a, 2 + %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b + br i1 %cmp8, label %if, label %else + +if: + store i32 2, i32 addrspace(1)* %arrayidx10, align 4 + br label %end + +else: + store i32 3, i32 addrspace(1)* %arrayidx10, align 4 + br label %end + +end: + ret void +} + +; SI-LABEL: @simple_test_v_if +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] + +; SI: ; BB#1 +; SI: buffer_store_dword +; SI: s_endpgm + +; SI: BB1_2: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %store, label %exit + +store: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; SI-LABEL: @simple_test_v_loop +; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: s_cbranch_execz BB2_2 + +; SI: ; BB#1: +; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} + +; SI: BB2_3: +; SI: buffer_load_dword +; SI: buffer_store_dword +; SI: v_cmp_eq_i32_e32 vcc, +; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] +; SI: s_andn2_b64 exec, exec, [[OR_SREG]] +; SI: s_cbranch_execnz BB2_3 + +define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +entry: + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %is.0 = icmp ne i32 %tid, 0 + %limit = add i32 %tid, 64 + br i1 %is.0, label %loop, label %exit + +loop: + %i = phi i32 [%tid, %entry], [%i.inc, %loop] + %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i + %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i + %load = load i32, i32 addrspace(1)* %src + store i32 %load, i32 
addrspace(1)* %gep.dst + %i.inc = add nsw i32 %i, 1 + %cmp = icmp eq i32 %limit, %i.inc + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +; SI-LABEL: @multi_vcond_loop + +; Load loop limit from buffer +; Branch to exit if uniformly not taken +; SI: ; BB#0: +; SI: buffer_load_dword [[VBOUND:v[0-9]+]] +; SI: v_cmp_lt_i32_e32 vcc +; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] +; SI: s_cbranch_execz BB3_2 + +; Initialize inner condition to false +; SI: ; BB#1: +; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] + +; Clear exec bits for workitems that load -1s +; SI: BB3_3: +; SI: buffer_load_dword [[B:v[0-9]+]] +; SI: buffer_load_dword [[A:v[0-9]+]] +; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] +; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] +; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] +; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] +; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] +; SI: s_cbranch_execz BB3_5 + +; SI: BB#4: +; SI: buffer_store_dword +; SI: v_cmp_ge_i64_e32 vcc +; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] + +; SI: BB3_5: +; SI: s_or_b64 exec, exec, [[ORNEG1]] +; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] +; SI: s_andn2_b64 exec, exec, [[COND_STATE]] +; SI: s_cbranch_execnz BB3_3 + +; SI: BB#6 +; SI: s_or_b64 exec, exec, [[COND_STATE]] + +; SI: BB3_2: +; SI-NOT: [[COND_STATE]] +; SI: s_endpgm + +define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +bb: + %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 + %tmp4 = sext i32 %tmp to i64 + %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 + %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 + %tmp7 = icmp sgt i32 %tmp6, 0 + %tmp8 = sext i32 %tmp6 to i64 + br i1 %tmp7, label %bb10, label %bb26 + +bb10: ; preds = %bb, %bb20 + %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] + %tmp12 = add nsw i64 %tmp11, %tmp4 + %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 + %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 + %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 + %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 + %tmp17 = icmp ne i32 %tmp14, -1 + %tmp18 = icmp ne i32 %tmp16, -1 + %tmp19 = and i1 %tmp17, %tmp18 + br i1 %tmp19, label %bb20, label %bb26 + +bb20: ; preds = %bb10 + %tmp21 = add nsw i32 %tmp16, %tmp14 + %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 + store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 + %tmp23 = add nuw nsw i64 %tmp11, 1 + %tmp24 = icmp slt i64 %tmp23, %tmp8 + br i1 %tmp24, label %bb10, label %bb26 + +bb26: ; preds = %bb10, %bb20, %bb + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll new file mode 100644 index 00000000000..6f3b4847fbd --- /dev/null +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -0,0 +1,77 @@ +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC 
%s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}vector_read:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 1, i32* %y
+ store i32 2, i32* %z
+ store i32 3, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index
+ %2 = load i32, i32* %1
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}vector_write:
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOV
+; EG: MOVA_INT
+; EG: MOVA_INT
+define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index
+ store i32 1, i32* %1
+ %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index
+ %3 = load i32, i32* %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+; This test should be optimized to:
+; store i32 0, i32 addrspace(1)* %out
+; FUNC-LABEL: {{^}}bitcast_gep:
+; EG: STORE_RAW
+define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+ %0 = alloca [4 x i32]
+ %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0
+ %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2
+ %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3
+ store i32 0, i32* %x
+ store i32 0, i32* %y
+ store i32 0, i32* %z
+ store i32 0, i32* %w
+ %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1
+ %2 = bitcast i32* %1 to [4 x i32]*
+ %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0
+ %4 = load i32, i32* %3
+ store i32 %4, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
new file mode 100644
index 00000000000..fb6a17e6714
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s
+
+; NI: {{^}}vtx_fetch32:
+; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
+; CM: {{^}}vtx_fetch32:
+; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
+
+define void @vtx_fetch32(i32 addrspace(1)* 
%out, i32 addrspace(1)* %in) { +entry: + %0 = load i32, i32 addrspace(1)* %in + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; NI: {{^}}vtx_fetch128: +; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 +; XXX: Add a case for Cayman when v4i32 stores are supported. + +define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +entry: + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in + store <4 x i32> %0, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll new file mode 100644 index 00000000000..9b2f229c05a --- /dev/null +++ b/test/CodeGen/AMDGPU/vop-shrink.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; Test that we correctly commute a sub instruction +; FUNC-LABEL: {{^}}sub_rev: +; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s +; SI: v_subrev_i32_e32 v{{[0-9]+}}, s + +; ModuleID = 'vop-shrink.ll' + +define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { +entry: + %vgpr = call i32 @llvm.r600.read.tidig.x() #1 + %tmp = icmp eq i32 %cond, 0 + br i1 %tmp, label %if, label %else + +if: ; preds = %entry + %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + %tmp2 = extractelement <4 x i32> %sgpr, i32 1 + store i32 %tmp2, i32 addrspace(1)* %out + br label %endif + +else: ; preds = %entry + %tmp3 = extractelement <4 x i32> %sgpr, i32 2 + %tmp4 = sub i32 %vgpr, %tmp3 + store i32 %tmp4, i32 addrspace(1)* %out + br label %endif + +endif: ; preds = %else, %if + ret void +} + +; Test that we fold an immediate that was illegal for a 64-bit op into the +; 32-bit op when we shrink it. 
+ +; FUNC-LABEL: {{^}}add_fold: +; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 +define void @add_fold(float addrspace(1)* %out) { +entry: + %tmp = call i32 @llvm.r600.read.tidig.x() + %tmp1 = uitofp i32 %tmp to float + %tmp2 = fadd float %tmp1, 1.024000e+03 + store float %tmp2, float addrspace(1)* %out + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll new file mode 100644 index 00000000000..a3014b03d2b --- /dev/null +++ b/test/CodeGen/AMDGPU/vselect.ll @@ -0,0 +1,77 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s + +;EG: {{^}}test_select_v2i32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v2i32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { +entry: + %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 + %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 + %cmp = icmp ne <2 x i32> %0, %1 + %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1 + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v2f32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v2f32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { +entry: + %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 + %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 + %cmp = fcmp une <2 x float> %0, %1 + %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 + store <2 x float> %result, <2 x float> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v4i32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +;SI: {{^}}test_select_v4i32: +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e64 + +define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +entry: + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 + %cmp = icmp ne <4 x i32> %0, %1 + %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +;EG: {{^}}test_select_v4f32: +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], 
T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+entry:
+ %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
+ %cmp = fcmp une <4 x float> %0, %1
+ %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
+ store <4 x float> %result, <4 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vselect64.ll b/test/CodeGen/AMDGPU/vselect64.ll
new file mode 100644
index 00000000000..ef85ebe7899
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vselect64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
+
+; CHECK-LABEL: {{^}}test_select_v4i64:
+; Make sure the vectors aren't being stored on the stack. We know they are
+; being stored on the stack if the shader uses at least 10 registers.
+; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
+define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+entry:
+ %cmp = icmp ne <4 x i32> %c, <i32 0, i32 1, i32 2, i32 3>
+ %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
new file mode 100644
index 00000000000..4584d6e2525
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s
+
+; This tests for a bug where vertex fetch clauses right before an ENDIF
+; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER
+; for the ALU clause before the vertex fetch instead of emitting a POP instruction
+; after the fetch clause.
+
+
+; CHECK-LABEL: {{^}}test:
+; CHECK-NOT: ALU_POP_AFTER
+; CHECK: TEX
+; CHECK-NEXT: POP
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %endif, label %if
+
+if:
+ %1 = load i32, i32 addrspace(1)* %in
+ br label %endif
+
+endif:
+ %x = phi i32 [ %1, %if], [ 0, %entry]
+ store i32 %x, i32 addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/vtx-schedule.ll b/test/CodeGen/AMDGPU/vtx-schedule.ll
new file mode 100644
index 00000000000..912e258ebb8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test is for a scheduler bug where VTX_READ instructions that used
+; the result of another VTX_READ instruction were being grouped in the
+; same fetch clause.
+ +; CHECK: {{^}}test: +; CHECK: Fetch clause +; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 +; CHECK: Fetch clause +; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 +define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { +entry: + %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 + %1 = load i32, i32 addrspace(1)* %0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll new file mode 100644 index 00000000000..5cc7577cad3 --- /dev/null +++ b/test/CodeGen/AMDGPU/wait.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s + +; CHECK-LABEL: {{^}}main: +; CHECK: s_load_dwordx4 +; CHECK: s_load_dwordx4 +; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; CHECK: s_endpgm +define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { +main_body: + %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 + %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) + %tmp12 = extractelement <4 x float> %tmp11, i32 0 + %tmp13 = extractelement <4 x float> %tmp11, i32 1 + call void @llvm.AMDGPU.barrier.global() #1 + %tmp14 = extractelement <4 x float> %tmp11, i32 2 +; %tmp15 = extractelement <4 x float> %tmp11, i32 3 + %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt + %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 + %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 + %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) + %tmp19 = extractelement <4 x float> %tmp18, i32 0 + %tmp20 = extractelement <4 x float> %tmp18, i32 1 + %tmp21 = extractelement <4 x float> %tmp18, i32 2 + %tmp22 = extractelement <4 x float> %tmp18, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) + ret void +} + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.global() #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="1" } +attributes #1 = { noduplicate nounwind } +attributes #2 = { nounwind readnone } + +!0 = !{!1, !1, i64 0, i32 1} +!1 = !{!"const", null} diff --git a/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/test/CodeGen/AMDGPU/work-item-intrinsics.ll new file mode 100644 index 00000000000..4328e964c1b --- /dev/null +++ b/test/CodeGen/AMDGPU/work-item-intrinsics.ll @@ -0,0 +1,238 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 
-mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}ngroups_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].X + +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ngroups_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].Y + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ngroups_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @ngroups_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.ngroups.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[0].W + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].X + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}global_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Y + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @global_size_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.global.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_x: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_y: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[1].W + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; 
GCN: buffer_store_dword [[VVAL]] +define void @local_size_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}local_size_z: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].X + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @local_size_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.local.size.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}get_work_dim: +; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] +; EG: MOV [[VAL]], KC0[2].Z + +; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb +; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[VVAL]] +define void @get_work_dim (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.AMDGPU.read.workdim() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; The tgid values are stored in sgprs offset by the number of user sgprs. +; Currently we always use exactly 2 user sgprs for the pointer to the +; kernel arguments, but this may change in the future. + +; FUNC-LABEL: {{^}}tgid_x: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tgid_y: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tgid_z: +; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 +; GCN: buffer_store_dword [[VVAL]] +define void @tgid_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tgid.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_x: +; GCN: buffer_store_dword v0 +define void @tidig_x (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.x() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_y: +; GCN: buffer_store_dword v1 +define void @tidig_y (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.y() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}tidig_z: +; GCN: buffer_store_dword v2 +define void @tidig_z (i32 addrspace(1)* %out) { +entry: + %0 = call i32 @llvm.r600.read.tidig.z() #0 + store i32 %0, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.ngroups.x() #0 +declare i32 @llvm.r600.read.ngroups.y() #0 +declare i32 @llvm.r600.read.ngroups.z() #0 + +declare i32 @llvm.r600.read.global.size.x() #0 +declare i32 @llvm.r600.read.global.size.y() #0 +declare i32 @llvm.r600.read.global.size.z() #0 + +declare i32 @llvm.r600.read.local.size.x() #0 +declare i32 @llvm.r600.read.local.size.y() #0 +declare i32 @llvm.r600.read.local.size.z() #0 + +declare i32 @llvm.r600.read.tgid.x() #0 +declare i32 @llvm.r600.read.tgid.y() #0 +declare i32 @llvm.r600.read.tgid.z() #0 + +declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.r600.read.tidig.y() #0 +declare i32 @llvm.r600.read.tidig.z() #0 + +declare i32 @llvm.AMDGPU.read.workdim() #0 + +attributes #0 = { readnone } diff --git 
a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll new file mode 100644 index 00000000000..8b383e4c393 --- /dev/null +++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s + +; We want all MULLO_INT inst to be last in their instruction group +;CHECK: {{^}}fill3d: +;CHECK-NOT: MULLO_INT T[0-9]+ + +define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { +entry: + %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 + %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 + %mul = mul i32 %y.i18, %x.i + %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1 + %mul3 = mul i32 %mul, %z.i17 + %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 + %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1 + %mul26.i = mul i32 %x.i12.i, %x.i.i + %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %add.i16 = add i32 %x.i4.i, %mul26.i + %mul7 = mul i32 %add.i16, %y.i18 + %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1 + %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1 + %mul30.i = mul i32 %y.i14.i, %y.i.i + %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1 + %add.i14 = add i32 %mul30.i, %mul7 + %mul819 = add i32 %add.i14, %y.i6.i + %add = mul i32 %mul819, %z.i17 + %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1 + %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1 + %mul33.i = mul i32 %z.i16.i, %z.i.i + %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1 + %add.i = add i32 %z.i8.i, %mul33.i + %add13 = add i32 %add.i, %add + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13 + store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.local.size.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.z() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.global.size.z() #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!opencl.kernels = !{!0, !1, !2} + +!0 = !{null} +!1 = !{null} +!2 = !{void (i32 addrspace(1)*)* @fill3d} diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll new file mode 100644 index 00000000000..089db59eabc --- /dev/null +++ b/test/CodeGen/AMDGPU/xor.ll @@ -0,0 +1,173 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + + +; FUNC-LABEL: {{^}}xor_v2i32: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} + +define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { + %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 + %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 + %result = xor <2 x i32> %a, %b + store <2 x i32> %result, <2 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}xor_v4i32: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { + %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 + %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 + %result = xor <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}xor_i1: +; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} + +; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} +; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} +; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] +; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] +; SI: buffer_store_dword [[RESULT]] +; SI: s_endpgm +define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { + %a = load float, float addrspace(1) * %in0 + %b = load float, float addrspace(1) * %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 1.000000e+00 + %xor = xor i1 %acmp, %bcmp + %result = select i1 %xor, float %a, float %b + store float %result, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_xor_i1: +; SI: buffer_load_ubyte [[B:v[0-9]+]] +; SI: buffer_load_ubyte [[A:v[0-9]+]] +; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] +; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] +; SI: buffer_store_byte [[RESULT]] +define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { + %a = load i1, i1 addrspace(1)* %in0 + %b = load i1, i1 addrspace(1)* %in1 + %xor = xor i1 %a, %b + store i1 %xor, i1 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i32: +; SI: v_xor_b32_e32 +define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { + %a = load i32, i32 addrspace(1)* %in0 + %b = load i32, i32 addrspace(1)* %in1 + %result = xor i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_i32: +; SI: s_xor_b32 +define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %result = xor i32 %a, %b + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_not_i32: +; SI: s_not_b32 
+define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { + %result = xor i32 %a, -1 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_not_i32: +; SI: v_not_b32 +define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { + %a = load i32, i32 addrspace(1)* %in0 + %b = load i32, i32 addrspace(1)* %in1 + %result = xor i32 %a, -1 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_xor_i64: +; SI: v_xor_b32_e32 +; SI: v_xor_b32_e32 +; SI: s_endpgm +define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { + %a = load i64, i64 addrspace(1)* %in0 + %b = load i64, i64 addrspace(1)* %in1 + %result = xor i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_xor_i64: +; SI: s_xor_b64 +; SI: s_endpgm +define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %result = xor i64 %a, %b + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}scalar_not_i64: +; SI: s_not_b64 +define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { + %result = xor i64 %a, -1 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}vector_not_i64: +; SI: v_not_b32 +; SI: v_not_b32 +define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { + %a = load i64, i64 addrspace(1)* %in0 + %b = load i64, i64 addrspace(1)* %in1 + %result = xor i64 %a, -1 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; Test that we have a pattern to match xor inside a branch. +; Note that in the future the backend may be smart enough to +; use an SALU instruction for this. + +; FUNC-LABEL: {{^}}xor_cf: +; SI: s_xor_b64 +define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +entry: + %0 = icmp eq i64 %a, 0 + br i1 %0, label %if, label %else + +if: + %1 = xor i64 %a, %b + br label %endif + +else: + %2 = load i64, i64 addrspace(1)* %in + br label %endif + +endif: + %3 = phi i64 [%1, %if], [%2, %else] + store i64 %3, i64 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll new file mode 100644 index 00000000000..033055db185 --- /dev/null +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 +; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI + +; R600: {{^}}test: +; R600: MEM_RAT_CACHELESS STORE_RAW +; R600: MEM_RAT_CACHELESS STORE_RAW + +; SI: {{^}}test: +; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} +; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] +; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} +define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +entry: + %0 = mul i32 %a, %b + %1 = add i32 %0, %c + %2 = zext i32 %1 to i64 + store i64 %2, i64 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}testi1toi32: +; SI: v_cndmask_b32 +define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = icmp eq i32 %a, %b + %1 = zext i1 %0 to i32 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}zext_i1_to_i64: +; SI: s_mov_b32 s{{[0-9]+}}, 0 +; SI: v_cmp_eq_i32 +; SI: v_cndmask_b32 +define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { + %cmp = icmp 
eq i32 %a, %b + %ext = zext i1 %cmp to i64 + store i64 %ext, i64 addrspace(1)* %out, align 8 + ret void +} diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll deleted file mode 100644 index c7bcfd2ddab..00000000000 --- a/test/CodeGen/R600/32-bit-local-address-space.ll +++ /dev/null @@ -1,139 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and -; the global address space(1) uses 64-bit pointers. These tests check to make sure -; the correct pointer size is used for the local address space. - -; The e{{32|64}} suffix on the instructions refers to the encoding size and not -; the size of the operands. The operand size is denoted in the instruction name. -; Instructions with B32, U32, and I32 in their name take 32-bit operands, while -; instructions with B64, U64, and I64 take 64-bit operands. - -; FUNC-LABEL: {{^}}local_address_load: -; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]] -; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] -define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = load i32, i32 addrspace(3)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep: -; SI: s_add_i32 [[SPTR:s[0-9]]] -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_const_offset: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4 -define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1 - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Offset too large, can't fold into 16-bit immediate offset. 
-; FUNC-LABEL: {{^}}local_address_gep_large_const_offset: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_read_b32 [[VPTR]] -define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385 - %1 = load i32, i32 addrspace(3)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}null_32bit_lds_ptr: -; SI: v_cmp_ne_i32 -; SI-NOT: v_cmp_ne_i32 -; SI: v_cndmask_b32 -define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind { - %cmp = icmp ne i32 addrspace(3)* %lds, null - %x = select i1 %cmp, i32 123, i32 456 - store i32 %x, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}mul_32bit_ptr: -; SI: s_mul_i32 -; SI-NEXT: s_add_i32 -; SI: ds_read_b32 -define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) { - %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0 - %val = load float, float addrspace(3)* %ptr - store float %val, float addrspace(1)* %out - ret void -} - -@g_lds = addrspace(3) global float undef, align 4 - -; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] -define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { - %val = load float, float addrspace(3)* @g_lds - store float %val, float addrspace(1)* %out - ret void -} - - -@ptr = addrspace(3) global i32 addrspace(3)* undef -@dst = addrspace(3) global [16384 x i32] undef - -; FUNC-LABEL: {{^}}global_ptr: -; SI: ds_write_b32 -define void @global_ptr() nounwind { - store i32 addrspace(3)* getelementptr ([16384 x i32], [16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr - ret void -} - -; FUNC-LABEL: {{^}}local_address_store: -; SI: ds_write_b32 -define void @local_address_store(i32 addrspace(3)* %out, i32 %val) { - store i32 %val, i32 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_store: -; SI: s_add_i32 [[SADDR:s[0-9]+]], -; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]] -; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}} -define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}local_address_gep_const_offset_store: -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}} -; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} -; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4 -define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1 - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} - -; Offset too large, can't fold into 16-bit immediate offset. 
-; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store: -; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004 -; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}} -define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) { - %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385 - store i32 %val, i32 addrspace(3)* %gep, align 4 - ret void -} diff --git a/test/CodeGen/R600/README b/test/CodeGen/R600/README deleted file mode 100644 index 96998bba28f..00000000000 --- a/test/CodeGen/R600/README +++ /dev/null @@ -1,21 +0,0 @@ -+==============================================================================+ -| How to organize the lit tests | -+==============================================================================+ - -- If you write a test for matching a single DAG opcode or intrinsic, it should - go in a file called {opcode_name,intrinsic_name}.ll (e.g. fadd.ll) - -- If you write a test that matches several DAG opcodes and checks for a single - ISA instruction, then that test should go in a file called {ISA_name}.ll (e.g. - bfi_int.ll - -- For all other tests, use your best judgement for organizing tests and naming - the files. - -+==============================================================================+ -| Naming conventions | -+==============================================================================+ - -- Use dash '-' and not underscore '_' to separate words in file names, unless - the file is named after a DAG opcode or ISA instruction that has an - underscore '_' in its name. diff --git a/test/CodeGen/R600/add-debug.ll b/test/CodeGen/R600/add-debug.ll deleted file mode 100644 index 529905dd36a..00000000000 --- a/test/CodeGen/R600/add-debug.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -debug -; RUN: llc < %s -march=amdgcn -mcpu=tonga -debug -; REQUIRES: asserts - -; Check that SelectionDAGDumper does not crash on int_SI_if. 
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = add i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll deleted file mode 100644 index 655e75dbc1a..00000000000 --- a/test/CodeGen/R600/add.ll +++ /dev/null @@ -1,192 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test1: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}} -;SI-NOT: [[REG]] -;SI: buffer_store_dword [[REG]], -define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test2: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = add <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test4: -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = add <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test8: -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT - -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { -entry: - %0 = add <8 x i32> %a, %b - store <8 x i32> %0, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test16: -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: 
ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT -; EG: ADD_INT - -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -; SI: s_add_i32 -define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { -entry: - %0 = add <16 x i32> %a, %b - store <16 x i32> %0, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}add64: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = add i64 %a, %b - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they -; use VCC. The test is designed so that %a will be stored in an SGPR and -; %0 will be stored in a VGPR, so the comiler will be forced to copy %a -; to a VGPR before doing the add. - -; FUNC-LABEL: {{^}}add64_sgpr_vgpr: -; SI-NOT: v_addc_u32_e32 s - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { -entry: - %0 = load i64, i64 addrspace(1)* %in - %1 = add i64 %a, %0 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; Test i64 add inside a branch. -; FUNC-LABEL: {{^}}add64_in_branch: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: ADD_INT {{[* ]*}}[[LO]] -; EG-DAG: ADDC_UINT -; EG-DAG: ADD_INT -; EG-DAG: ADD_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = add i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll deleted file mode 100644 index 8346add7df9..00000000000 --- a/test/CodeGen/R600/add_i64.ll +++ /dev/null @@ -1,84 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - - -declare i32 @llvm.r600.read.tidig.x() readnone - -; SI-LABEL: {{^}}test_i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid - %a = load i64, i64 addrspace(1)* %a_ptr - %b = load i64, i64 addrspace(1)* %b_ptr - %result = add i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Check that the SGPR add operand is correctly moved to a VGPR. 
-; SI-LABEL: {{^}}sgpr_operand: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { - %foo = load i64, i64 addrspace(1)* %in, align 8 - %result = add i64 %foo, %a - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Swap the arguments. Check that the SGPR -> VGPR copy works with the -; SGPR as other operand. -; -; SI-LABEL: {{^}}sgpr_operand_reversed: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { - %foo = load i64, i64 addrspace(1)* %in, align 8 - %result = add i64 %a, %foo - store i64 %result, i64 addrspace(1)* %out - ret void -} - - -; SI-LABEL: {{^}}test_v2i64_sreg: -; SI: s_add_u32 -; SI: s_addc_u32 -; SI: s_add_u32 -; SI: s_addc_u32 -define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { - %result = add <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}test_v2i64_vreg: -; SI: v_add_i32 -; SI: v_addc_u32 -; SI: v_add_i32 -; SI: v_addc_u32 -define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid - %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = add <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}trunc_i64_add_to_i32: -; SI: s_load_dword s[[SREG0:[0-9]+]] -; SI: s_load_dword s[[SREG1:[0-9]+]] -; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI-NOT: addc -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %add = add i64 %b, %a - %trunc = trunc i64 %add to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll deleted file mode 100644 index 4be8c584752..00000000000 --- a/test/CodeGen/R600/address-space.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; Test that codegenprepare understands address space sizes - -%struct.foo = type { [3 x float], [3 x float] } - -; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is -; already in a VGPR after the first read. 
- -; CHECK-LABEL: {{^}}do_as_ptr_calcs: -; CHECK: s_load_dword [[SREG1:s[0-9]+]], -; CHECK: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG1]] -; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12 -; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG2]] offset:20 -define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind { -entry: - %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 - %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 - br label %bb32 - -bb32: - %a = load float, float addrspace(3)* %x, align 4 - %b = load float, float addrspace(3)* %y, align 4 - %cmp = fcmp one float %a, %b - br i1 %cmp, label %bb34, label %bb33 - -bb33: - unreachable - -bb34: - unreachable -} - - diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll deleted file mode 100644 index 5672d470bd7..00000000000 --- a/test/CodeGen/R600/and.ll +++ /dev/null @@ -1,296 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test2: -; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = and <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test4: -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = and <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_and_i32: -; SI: s_and_b32 -define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %and = and i32 %a, %b - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_and_constant_i32: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 -define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { - %and = and i32 %a, 1234567 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_i32: -; SI: v_and_b32 -define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { - %a = load i32, i32 addrspace(1)* 
%aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %and = and i32 %a, %b - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_constant_i32 -; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} -define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %and = and i32 %a, 1234567 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 -; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} -define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %and = and i32 %a, 64 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 -; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} -define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %and = and i32 %a, -16 - store i32 %and, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_and_i64 -; SI: s_and_b64 -define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %and = and i64 %a, %b - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: Should use SGPRs -; FUNC-LABEL: {{^}}s_and_i1: -; SI: v_and_b32 -define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { - %and = and i1 %a, %b - store i1 %and, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_and_constant_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { - %and = and i64 %a, 281474976710655 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_and_i64: -; SI: v_and_b32 -; SI: v_and_b32 -define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %and = and i64 %a, %b - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_and_i64_br: -; SI: v_and_b32 -; SI: v_and_b32 -define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %endif - -if: - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %and = and i64 %a, %b - br label %endif - -endif: - %tmp1 = phi i64 [%and, %if], [0, %entry] - store i64 %tmp1, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_and_constant_i64: -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %and = and i64 %a, 1234567 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: Replace and 0 with mov 0 -; FUNC-LABEL: {{^}}v_and_inline_imm_i64: -; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} -; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} -define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %and = and i64 %a, 64 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: 
{{^}}s_and_inline_imm_64_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64 -define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 64 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_1_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1 -define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 1 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_1.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0 -define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4607182418800017408 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_1.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0 -define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13830554455654793216 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_0.5_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5 -define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4602678819172646912 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_0.5_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5 -define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13826050856027422720 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0 -define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4611686018427387904 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -2.0 -define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13835058055282163712 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_4.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0 -define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4616189618054758400 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_imm_neg_4.0_i64 -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0 -define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13839561654909534208 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - - -; Test with the 64-bit integer bitpattern for a 32-bit float in the -; low 32-bits, which is not a valid 64-bit inline immmediate. 
- -; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 1082130432 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: Copy of -1 register -; FUNC-LABEL: {{^}}s_and_inline_imm_f32_neg_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], -4.0 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} -; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}} -define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, -1065353216 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; Shift into upper 32-bits -; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 4647714815446351872 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64 -; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 -; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} -; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { - %and = and i64 %a, 13871086852301127680 - store i64 %and, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/anyext.ll b/test/CodeGen/R600/anyext.ll deleted file mode 100644 index 48d8f312249..00000000000 --- a/test/CodeGen/R600/anyext.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}anyext_i1_i32: -; CHECK: v_cndmask_b32_e64 -define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - %1 = zext i1 %0 to i8 - %2 = xor i8 %1, -1 - %3 = and i8 %2, 1 - %4 = zext i8 %3 to i32 - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll deleted file mode 100644 index 8c2a0795860..00000000000 --- a/test/CodeGen/R600/array-ptr-calc-i32.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s - -declare i32 @llvm.SI.tid() nounwind readnone -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate - -; The required pointer calculations for the alloca'd actually requires -; an add and won't be folded into the addressing, which fails with a -; 64-bit pointer add. This should work since private pointers should -; be 32-bits. 
- -; SI-LABEL: {{^}}test_private_array_ptr_calc: - -; FIXME: We end up with zero argument for ADD, because -; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index -; with the appropriate offset. We should fold this into the store. -; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}} -; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}] -; -; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this -; alloca to a vector. It currently fails because it does not know how -; to interpret: -; getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b - -; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16 -; SI-PROMOTE: ds_write_b32 [[PTRREG]] -define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { - %alloca = alloca [4 x i32], i32 4, align 16 - %tid = call i32 @llvm.SI.tid() readnone - %a_ptr = getelementptr i32, i32 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid - %a = load i32, i32 addrspace(1)* %a_ptr - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b - store i32 %result, i32* %alloca_ptr, align 4 - ; Dummy call - call void @llvm.AMDGPU.barrier.local() nounwind noduplicate - %reload = load i32, i32* %alloca_ptr, align 4 - %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 - ret void -} - diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll deleted file mode 100644 index eae095eb844..00000000000 --- a/test/CodeGen/R600/array-ptr-calc-i64.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.SI.tid() readnone - -; SI-LABEL: {{^}}test_array_ptr_calc: -; SI: v_mul_lo_i32 -; SI: v_mul_hi_i32 -define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.SI.tid() readnone - %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 - %b_ptr = getelementptr i32, i32 addrspace(1)* %inB, i32 %tid - %a = load i32, i32 addrspace(1)* %a_ptr - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll deleted file mode 100644 index ef2560ef184..00000000000 --- a/test/CodeGen/R600/atomic_cmp_swap_local.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset: -; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SWAP:s[0-9]+]], 
s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: -; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] -; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic - %result = extractvalue { i64, i1 } %pair, 0 - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset -; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28 -; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] -; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SICI: s_load_dwordx2 
s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 -; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0 -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]] -; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] -; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic - %result = extractvalue { i64, i1 } %pair, 0 - ret void -} diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll deleted file mode 100644 index 20c685447ee..00000000000 --- a/test/CodeGen/R600/atomic_load_add.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_add_local: -; R600: LDS_ADD * -; SI: ds_add_u32 -define void @atomic_add_local(i32 addrspace(3)* %local) { - %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_local_const_offset: -; R600: LDS_ADD * -; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 - %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_ret_local: -; R600: LDS_ADD_RET * -; SI: ds_add_rtn_u32 -define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: -; R600: LDS_ADD_RET * -; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 - %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll deleted file mode 100644 index 4c6f45525b9..00000000000 --- a/test/CodeGen/R600/atomic_load_sub.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_sub_local: -; R600: LDS_SUB * -; SI: ds_sub_u32 -define void @atomic_sub_local(i32 addrspace(3)* %local) { - %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst - ret void -} - -; 
FUNC-LABEL: {{^}}atomic_sub_local_const_offset: -; R600: LDS_SUB * -; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 - %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_ret_local: -; R600: LDS_SUB_RET * -; SI: ds_sub_rtn_u32 -define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: -; R600: LDS_SUB_RET * -; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { - %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 - %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/basic-branch.ll b/test/CodeGen/R600/basic-branch.ll deleted file mode 100644 index abdc4afef47..00000000000 --- a/test/CodeGen/R600/basic-branch.ll +++ /dev/null @@ -1,16 +0,0 @@ -; XFAIL: * -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}test_branch( -define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { - %cmp = icmp ne i32 %val, 0 - br i1 %cmp, label %store, label %end - -store: - store i32 222, i32 addrspace(1)* %out - ret void - -end: - ret void -} diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll deleted file mode 100644 index f0263caf5d6..00000000000 --- a/test/CodeGen/R600/basic-loop.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; CHECK-LABEL: {{^}}test_loop: -define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { -entry: - br label %loop.body - -loop.body: - %i = phi i32 [0, %entry], [%i.inc, %loop.body] - store i32 222, i32 addrspace(1)* %out - %cmp = icmp ne i32 %i, %val - %i.inc = add i32 %i, 1 - br i1 %cmp, label %loop.body, label %end - -end: - ret void -} diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll deleted file mode 100644 index 32e3fc26106..00000000000 --- a/test/CodeGen/R600/bfe_uint.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}bfe_def: -; CHECK: BFE_UINT -define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { -entry: - %0 = lshr i32 %x, 5 - %1 = and i32 %0, 15 ; 0xf - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; This program could be implemented using a BFE_UINT instruction, however -; since the lshr constant + number of bits in the mask is >= 32, it can also be -; implmented with a LSHR instruction, which is better, because LSHR has less -; operands and requires less constants. 
- -; CHECK: {{^}}bfe_shift: -; CHECK-NOT: BFE_UINT -define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { -entry: - %0 = lshr i32 %x, 16 - %1 = and i32 %0, 65535 ; 0xffff - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll deleted file mode 100644 index 03349349735..00000000000 --- a/test/CodeGen/R600/bfi_int.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; BFI_INT Definition pattern from ISA docs -; (y & x) | (z & ~x) -; -; R600: {{^}}bfi_def: -; R600: BFI_INT -; SI: @bfi_def -; SI: v_bfi_b32 -define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { -entry: - %0 = xor i32 %x, -1 - %1 = and i32 %z, %0 - %2 = and i32 %y, %x - %3 = or i32 %1, %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; SHA-256 Ch function -; z ^ (x & (y ^ z)) -; R600: {{^}}bfi_sha256_ch: -; R600: BFI_INT -; SI: @bfi_sha256_ch -; SI: v_bfi_b32 -define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { -entry: - %0 = xor i32 %y, %z - %1 = and i32 %x, %0 - %2 = xor i32 %z, %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; SHA-256 Ma function -; ((x & z) | (y & (x | z))) -; R600: {{^}}bfi_sha256_ma: -; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W -; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W -; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} -; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} - -define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { -entry: - %0 = and i32 %x, %z - %1 = or i32 %x, %z - %2 = and i32 %y, %1 - %3 = or i32 %0, %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll deleted file mode 100644 index 2671c5d102b..00000000000 --- a/test/CodeGen/R600/big_alu.ll +++ /dev/null @@ -1,1173 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cedar - -;This test ensures that R600 backend can handle ifcvt properly -;and do not generate ALU clauses with more than 128 instructions. 
- -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 { -main_body: - %0 = extractelement <4 x float> %reg0, i32 0 - %1 = extractelement <4 x float> %reg0, i32 1 - %2 = extractelement <4 x float> %reg0, i32 2 - %3 = extractelement <4 x float> %reg0, i32 3 - %4 = extractelement <4 x float> %reg1, i32 0 - %5 = extractelement <4 x float> %reg9, i32 0 - %6 = extractelement <4 x float> %reg8, i32 0 - %7 = fcmp ugt float %6, 0.000000e+00 - %8 = select i1 %7, float %4, float %5 - %9 = extractelement <4 x float> %reg1, i32 1 - %10 = extractelement <4 x float> %reg9, i32 1 - %11 = extractelement <4 x float> %reg8, i32 0 - %12 = fcmp ugt float %11, 0.000000e+00 - %13 = select i1 %12, float %9, float %10 - %14 = extractelement <4 x float> %reg1, i32 2 - %15 = extractelement <4 x float> %reg9, i32 2 - %16 = extractelement <4 x float> %reg8, i32 0 - %17 = fcmp ugt float %16, 0.000000e+00 - %18 = select i1 %17, float %14, float %15 - %19 = extractelement <4 x float> %reg1, i32 3 - %20 = extractelement <4 x float> %reg9, i32 3 - %21 = extractelement <4 x float> %reg8, i32 0 - %22 = extractelement <4 x float> %reg2, i32 0 - %23 = extractelement <4 x float> %reg2, i32 1 - %24 = extractelement <4 x float> %reg2, i32 2 - %25 = extractelement <4 x float> %reg2, i32 3 - %26 = extractelement <4 x float> %reg3, i32 0 - %27 = extractelement <4 x float> %reg3, i32 1 - %28 = extractelement <4 x float> %reg3, i32 2 - %29 = extractelement <4 x float> %reg3, i32 3 - %30 = extractelement <4 x float> %reg4, i32 0 - %31 = extractelement <4 x float> %reg4, i32 1 - %32 = extractelement <4 x float> %reg4, i32 2 - %33 = extractelement <4 x float> %reg4, i32 3 - %34 = extractelement <4 x float> %reg5, i32 0 - %35 = extractelement <4 x float> %reg5, i32 1 - %36 = extractelement <4 x float> %reg5, i32 2 - %37 = extractelement <4 x float> %reg5, i32 3 - %38 = extractelement <4 x float> %reg6, i32 0 - %39 = extractelement <4 x float> %reg6, i32 1 - %40 = extractelement <4 x float> %reg6, i32 2 - %41 = extractelement <4 x float> %reg6, i32 3 - %42 = extractelement <4 x float> %reg7, i32 0 - %43 = extractelement <4 x float> %reg7, i32 1 - %44 = extractelement <4 x float> %reg7, i32 2 - %45 = extractelement <4 x float> %reg7, i32 3 - %46 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %47 = extractelement <4 x float> %46, i32 0 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %49 = extractelement <4 x float> %48, i32 1 - %50 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %51 = extractelement <4 x float> %50, i32 2 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) - %53 = extractelement <4 x float> %52, i32 0 - %54 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %55 = extractelement <4 x float> %54, i32 0 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %57 
= extractelement <4 x float> %56, i32 1 - %58 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %59 = extractelement <4 x float> %58, i32 2 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %61 = extractelement <4 x float> %60, i32 3 - %62 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %63 = extractelement <4 x float> %62, i32 0 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %65 = extractelement <4 x float> %64, i32 1 - %66 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %67 = extractelement <4 x float> %66, i32 2 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %69 = extractelement <4 x float> %68, i32 0 - %70 = fcmp oge float %69, 3.500000e+00 - %71 = sext i1 %70 to i32 - %72 = bitcast i32 %71 to float - %73 = bitcast float %72 to i32 - %74 = icmp ne i32 %73, 0 - %. = select i1 %74, float 0.000000e+00, float 0.000000e+00 - %75 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %76 = extractelement <4 x float> %75, i32 0 - %77 = fcmp oge float %76, 2.000000e+00 - %78 = sext i1 %77 to i32 - %79 = bitcast i32 %78 to float - %80 = bitcast float %79 to i32 - %81 = icmp ne i32 %80, 0 - br i1 %81, label %IF137, label %ENDIF136 - -IF137: ; preds = %main_body - %82 = insertelement <4 x float> undef, float %30, i32 0 - %83 = insertelement <4 x float> %82, float %31, i32 1 - %84 = insertelement <4 x float> %83, float %32, i32 2 - %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 - %86 = insertelement <4 x float> undef, float %30, i32 0 - %87 = insertelement <4 x float> %86, float %31, i32 1 - %88 = insertelement <4 x float> %87, float %32, i32 2 - %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 - %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89) - %91 = call float @llvm.AMDGPU.rsq.f32(float %90) - %92 = fmul float %30, %91 - %93 = fmul float %31, %91 - %94 = fmul float %32, %91 - %95 = insertelement <4 x float> undef, float %92, i32 0 - %96 = insertelement <4 x float> %95, float %93, i32 1 - %97 = insertelement <4 x float> %96, float %94, i32 2 - %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3 - %99 = insertelement <4 x float> undef, float %37, i32 0 - %100 = insertelement <4 x float> %99, float %38, i32 1 - %101 = insertelement <4 x float> %100, float %39, i32 2 - %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3 - %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102) - %104 = insertelement <4 x float> undef, float %92, i32 0 - %105 = insertelement <4 x float> %104, float %93, i32 1 - %106 = insertelement <4 x float> %105, float %94, i32 2 - %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3 - %108 = insertelement <4 x float> undef, float %40, i32 0 - %109 = insertelement <4 x float> %108, float %41, i32 1 - %110 = insertelement <4 x float> %109, float %42, i32 2 - %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3 - %112 = call 
float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111) - %113 = fsub float -0.000000e+00, %92 - %114 = fsub float -0.000000e+00, %93 - %115 = fsub float -0.000000e+00, %94 - %116 = insertelement <4 x float> undef, float %34, i32 0 - %117 = insertelement <4 x float> %116, float %35, i32 1 - %118 = insertelement <4 x float> %117, float %36, i32 2 - %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3 - %120 = insertelement <4 x float> undef, float %113, i32 0 - %121 = insertelement <4 x float> %120, float %114, i32 1 - %122 = insertelement <4 x float> %121, float %115, i32 2 - %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3 - %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123) - %125 = fdiv float 1.000000e+00, %124 - %126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %127 = extractelement <4 x float> %126, i32 0 - %128 = fmul float %127, %125 - %129 = fmul float %103, %128 - %130 = fmul float %112, %128 - %131 = bitcast float %. to i32 - %132 = sitofp i32 %131 to float - %133 = fdiv float 1.000000e+00, %132 - %134 = bitcast float %. to i32 - %135 = add i32 %134, -1 - %136 = bitcast i32 %135 to float - %137 = bitcast float %136 to i32 - br label %LOOP - -ENDIF136: ; preds = %main_body, %ENDIF154 - %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ] - %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ] - %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ] - %138 = fmul float %26, 0x3F847AE140000000 - %139 = fmul float %27, 0x3F847AE140000000 - %140 = fmul float %28, 0x3F847AE140000000 - %141 = insertelement <4 x float> undef, float %138, i32 0 - %142 = insertelement <4 x float> %141, float %139, i32 1 - %143 = insertelement <4 x float> %142, float %140, i32 2 - %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3 - %145 = extractelement <4 x float> %144, i32 0 - %146 = extractelement <4 x float> %144, i32 1 - %147 = extractelement <4 x float> %144, i32 2 - %148 = extractelement <4 x float> %144, i32 3 - %149 = insertelement <4 x float> undef, float %145, i32 0 - %150 = insertelement <4 x float> %149, float %146, i32 1 - %151 = insertelement <4 x float> %150, float %147, i32 2 - %152 = insertelement <4 x float> %151, float %148, i32 3 - %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3) - %154 = extractelement <4 x float> %153, i32 0 - %155 = extractelement <4 x float> %153, i32 1 - %156 = extractelement <4 x float> %153, i32 2 - %157 = extractelement <4 x float> %153, i32 3 - %158 = fmul float %26, 0x3F45A07B40000000 - %159 = fmul float %27, 0x3F45A07B40000000 - %160 = fmul float %28, 0x3F45A07B40000000 - %161 = insertelement <4 x float> undef, float %158, i32 0 - %162 = insertelement <4 x float> %161, float %159, i32 1 - %163 = insertelement <4 x float> %162, float %160, i32 2 - %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3 - %165 = extractelement <4 x float> %164, i32 0 - %166 = extractelement <4 x float> %164, i32 1 - %167 = extractelement <4 x float> %164, i32 2 - %168 = extractelement <4 x float> %164, i32 3 - %169 = insertelement <4 x float> undef, float %165, i32 0 - %170 = insertelement <4 x float> %169, float %166, i32 1 - %171 = insertelement <4 x float> %170, float %167, i32 2 - %172 = insertelement <4 x float> %171, float %168, i32 3 - %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3) - 
%174 = extractelement <4 x float> %173, i32 0 - %175 = extractelement <4 x float> %173, i32 1 - %176 = extractelement <4 x float> %173, i32 2 - %177 = extractelement <4 x float> %173, i32 3 - %178 = fmul float %176, 3.000000e+03 - %179 = fadd float %178, %28 - %180 = fdiv float 1.000000e+00, %33 - %181 = fmul float %32, %180 - %182 = call float @fabs(float %181) - %183 = fmul float %174, 0x3FD99999A0000000 - %184 = fadd float %183, 0x3FAEB851E0000000 - %185 = fmul float %175, 0x3FE3333340000000 - %186 = fadd float %185, %184 - %187 = fmul float %176, 2.000000e+00 - %188 = fadd float %187, %186 - %189 = fmul float %177, 4.000000e+00 - %190 = fadd float %189, %188 - %191 = fmul float %154, 0x3FB99999A0000000 - %192 = fadd float %191, %190 - %193 = fmul float %155, 0x3FD99999A0000000 - %194 = fadd float %193, %192 - %195 = fmul float %156, 0x3FE99999A0000000 - %196 = fadd float %195, %194 - %197 = fmul float %157, 0x4000CCCCC0000000 - %198 = fadd float %197, %196 - %199 = fmul float 0xBE5EFB4CC0000000, %182 - %200 = fmul float %199, %182 - %201 = call float @llvm.AMDIL.exp.(float %200) - %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000) - %203 = fadd float %202, 0x3FF4CCCCC0000000 - %204 = fmul float %203, 0x3FE1C71C80000000 - %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00) - %206 = fadd float %202, 0x3FF4CCCCC0000000 - %207 = fmul float %206, 0x3FE1C71C80000000 - %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00) - %209 = fadd float %202, 2.000000e+00 - %210 = fmul float %209, 0x3FD611A7A0000000 - %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00) - %212 = fmul float 2.000000e+00, %205 - %213 = fsub float -0.000000e+00, %212 - %214 = fadd float 3.000000e+00, %213 - %215 = fmul float %205, %214 - %216 = fmul float %205, %215 - %217 = fmul float 2.000000e+00, %208 - %218 = fsub float -0.000000e+00, %217 - %219 = fadd float 3.000000e+00, %218 - %220 = fmul float %208, %219 - %221 = fmul float %208, %220 - %222 = fmul float 2.000000e+00, %211 - %223 = fsub float -0.000000e+00, %222 - %224 = fadd float 3.000000e+00, %223 - %225 = fmul float %211, %224 - %226 = fmul float %211, %225 - %227 = fmul float %26, 0x3F368B5CC0000000 - %228 = fmul float %27, 0x3F368B5CC0000000 - %229 = insertelement <4 x float> undef, float %227, i32 0 - %230 = insertelement <4 x float> %229, float %228, i32 1 - %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2 - %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3 - %233 = extractelement <4 x float> %232, i32 0 - %234 = extractelement <4 x float> %232, i32 1 - %235 = insertelement <4 x float> undef, float %233, i32 0 - %236 = insertelement <4 x float> %235, float %234, i32 1 - %237 = insertelement <4 x float> %236, float undef, i32 2 - %238 = insertelement <4 x float> %237, float undef, i32 3 - %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2) - %240 = extractelement <4 x float> %239, i32 0 - %241 = insertelement <4 x float> undef, float %240, i32 0 - %242 = insertelement <4 x float> %241, float %228, i32 1 - %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2 - %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3 - %245 = extractelement <4 x float> %244, i32 0 - %246 = insertelement <4 x float> undef, float %245, i32 0 - %247 = insertelement <4 x float> %246, float undef, i32 1 - %248 = insertelement <4 x float> %247, float undef, i32 2 - 
%249 = insertelement <4 x float> %248, float undef, i32 3 - %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1) - %251 = extractelement <4 x float> %250, i32 0 - %252 = extractelement <4 x float> %250, i32 1 - %253 = extractelement <4 x float> %250, i32 2 - %254 = extractelement <4 x float> %250, i32 3 - %255 = fmul float %251, %216 - %256 = fmul float %252, %221 - %257 = fmul float %253, %226 - %258 = fmul float %254, 0.000000e+00 - %259 = fadd float %202, 0x3FF4CCCCC0000000 - %260 = fmul float %259, 0x3FE1C71C80000000 - %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00) - %262 = fadd float %202, 0x3FF4CCCCC0000000 - %263 = fmul float %262, 0x3FE1C71C80000000 - %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00) - %265 = fadd float %202, 2.000000e+00 - %266 = fmul float %265, 0x3FD611A7A0000000 - %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00) - %268 = fmul float 2.000000e+00, %261 - %269 = fsub float -0.000000e+00, %268 - %270 = fadd float 3.000000e+00, %269 - %271 = fmul float %261, %270 - %272 = fmul float %261, %271 - %273 = fmul float 2.000000e+00, %264 - %274 = fsub float -0.000000e+00, %273 - %275 = fadd float 3.000000e+00, %274 - %276 = fmul float %264, %275 - %277 = fmul float %264, %276 - %278 = fmul float 2.000000e+00, %267 - %279 = fsub float -0.000000e+00, %278 - %280 = fadd float 3.000000e+00, %279 - %281 = fmul float %267, %280 - %282 = fmul float %267, %281 - %283 = fmul float %26, 0x3F22DFD6A0000000 - %284 = fmul float %27, 0x3F22DFD6A0000000 - %285 = insertelement <4 x float> undef, float %283, i32 0 - %286 = insertelement <4 x float> %285, float %284, i32 1 - %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2 - %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3 - %289 = extractelement <4 x float> %288, i32 0 - %290 = extractelement <4 x float> %288, i32 1 - %291 = insertelement <4 x float> undef, float %289, i32 0 - %292 = insertelement <4 x float> %291, float %290, i32 1 - %293 = insertelement <4 x float> %292, float undef, i32 2 - %294 = insertelement <4 x float> %293, float undef, i32 3 - %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2) - %296 = extractelement <4 x float> %295, i32 0 - %297 = extractelement <4 x float> %295, i32 1 - %298 = extractelement <4 x float> %295, i32 2 - %299 = extractelement <4 x float> %295, i32 3 - %300 = fmul float %296, %272 - %301 = fmul float %297, %277 - %302 = fmul float %298, %282 - %303 = fmul float %299, 0.000000e+00 - %304 = fmul float %temp68.1, %37 - %305 = fmul float %temp68.1, %38 - %306 = fmul float %temp68.1, %39 - %307 = fmul float %temp69.0, %40 - %308 = fadd float %307, %304 - %309 = fmul float %temp69.0, %41 - %310 = fadd float %309, %305 - %311 = fmul float %temp69.0, %42 - %312 = fadd float %311, %306 - %313 = fmul float %temp70.0, %34 - %314 = fadd float %313, %308 - %315 = fmul float %temp70.0, %35 - %316 = fadd float %315, %310 - %317 = fmul float %temp70.0, %36 - %318 = fadd float %317, %312 - %319 = insertelement <4 x float> undef, float %314, i32 0 - %320 = insertelement <4 x float> %319, float %316, i32 1 - %321 = insertelement <4 x float> %320, float %318, i32 2 - %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3 - %323 = insertelement <4 x float> undef, float %314, i32 0 - %324 = insertelement <4 x float> %323, float %316, i32 1 - %325 = insertelement <4 x float> %324, float %318, i32 2 - 
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3 - %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326) - %328 = call float @llvm.AMDGPU.rsq.f32(float %327) - %329 = fmul float %314, %328 - %330 = fmul float %316, %328 - %331 = fmul float %318, %328 - %332 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %333 = extractelement <4 x float> %332, i32 0 - %334 = fsub float -0.000000e+00, %333 - %335 = fadd float 1.000000e+00, %334 - %336 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %337 = extractelement <4 x float> %336, i32 0 - %338 = fsub float -0.000000e+00, %337 - %339 = fadd float 1.000000e+00, %338 - %340 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %341 = extractelement <4 x float> %340, i32 0 - %342 = fsub float -0.000000e+00, %341 - %343 = fadd float 1.000000e+00, %342 - %344 = fsub float -0.000000e+00, %335 - %345 = fadd float %202, %344 - %346 = fsub float -0.000000e+00, %339 - %347 = fadd float %202, %346 - %348 = fadd float %347, 0xBFE3333340000000 - %349 = fsub float -0.000000e+00, %202 - %350 = fsub float -0.000000e+00, %343 - %351 = fadd float %349, %350 - %352 = insertelement <4 x float> undef, float %43, i32 0 - %353 = insertelement <4 x float> %352, float %44, i32 1 - %354 = insertelement <4 x float> %353, float %45, i32 2 - %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3 - %356 = insertelement <4 x float> undef, float %43, i32 0 - %357 = insertelement <4 x float> %356, float %44, i32 1 - %358 = insertelement <4 x float> %357, float %45, i32 2 - %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3 - %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359) - %361 = call float @llvm.AMDGPU.rsq.f32(float %360) - %362 = fmul float %45, %361 - %363 = call float @fabs(float %362) - %364 = fmul float %176, 0x3FECCCCCC0000000 - %365 = fadd float %364, %363 - %366 = fadd float %365, 0xBFEFAE1480000000 - %367 = fmul float %366, 0xC023FFFFC0000000 - %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00) - %369 = fsub float -0.000000e+00, %335 - %370 = fadd float %202, %369 - %371 = fadd float %370, 0x3FBEB851E0000000 - %372 = fsub float -0.000000e+00, %339 - %373 = fadd float %202, %372 - %374 = fadd float %373, 0xBFE0A3D700000000 - %375 = fsub float -0.000000e+00, %202 - %376 = fsub float -0.000000e+00, %343 - %377 = fadd float %375, %376 - %378 = insertelement <4 x float> undef, float %43, i32 0 - %379 = insertelement <4 x float> %378, float %44, i32 1 - %380 = insertelement <4 x float> %379, float %45, i32 2 - %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3 - %382 = insertelement <4 x float> undef, float %43, i32 0 - %383 = insertelement <4 x float> %382, float %44, i32 1 - %384 = insertelement <4 x float> %383, float %45, i32 2 - %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3 - %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385) - %387 = call float @llvm.AMDGPU.rsq.f32(float %386) - %388 = fmul float %45, %387 - %389 = call float @fabs(float %388) - %390 = fmul float %176, 0x3FF51EB860000000 - %391 = fadd float %390, %389 - %392 = fadd float %391, 0xBFEFAE1480000000 - %393 = fmul float %392, 0xC0490001A0000000 - %394 = call float 
@llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00) - %395 = fmul float 2.000000e+00, %368 - %396 = fsub float -0.000000e+00, %395 - %397 = fadd float 3.000000e+00, %396 - %398 = fmul float %368, %397 - %399 = fmul float %368, %398 - %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345) - %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348) - %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351) - %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00) - %404 = fmul float 2.000000e+00, %394 - %405 = fsub float -0.000000e+00, %404 - %406 = fadd float 3.000000e+00, %405 - %407 = fmul float %394, %406 - %408 = fmul float %394, %407 - %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371) - %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374) - %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377) - %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000) - %413 = fcmp oge float 2.200000e+03, %179 - %414 = sext i1 %413 to i32 - %415 = bitcast i32 %414 to float - %416 = bitcast float %415 to i32 - %417 = icmp ne i32 %416, 0 - br i1 %417, label %IF161, label %ENDIF160 - -LOOP: ; preds = %ENDIF139, %IF137 - %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ] - %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ] - %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ] - %418 = bitcast float %temp96.0 to i32 - %419 = icmp sge i32 %418, %137 - %420 = sext i1 %419 to i32 - %421 = bitcast i32 %420 to float - %422 = bitcast float %421 to i32 - %423 = icmp ne i32 %422, 0 - br i1 %423, label %IF140, label %ENDIF139 - -IF140: ; preds = %LOOP - %424 = fmul float %133, 5.000000e-01 - %425 = fmul float %129, %temp92.0 - %426 = fadd float %425, %22 - %427 = fmul float %130, %temp92.0 - %428 = fadd float %427, %23 - %429 = insertelement <4 x float> undef, float %426, i32 0 - %430 = insertelement <4 x float> %429, float %428, i32 1 - %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2 - %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3 - %433 = extractelement <4 x float> %432, i32 0 - %434 = extractelement <4 x float> %432, i32 1 - %435 = insertelement <4 x float> undef, float %433, i32 0 - %436 = insertelement <4 x float> %435, float %434, i32 1 - %437 = insertelement <4 x float> %436, float undef, i32 2 - %438 = insertelement <4 x float> %437, float undef, i32 3 - %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2) - %440 = extractelement <4 x float> %439, i32 3 - %441 = fcmp oge float %temp92.0, %440 - %442 = sext i1 %441 to i32 - %443 = bitcast i32 %442 to float - %444 = bitcast float %443 to i32 - %445 = icmp ne i32 %444, 0 - br i1 %445, label %IF146, label %ENDIF145 - -ENDIF139: ; preds = %LOOP - %446 = fadd float %temp88.0, %133 - %447 = fmul float %129, %446 - %448 = fadd float %447, %22 - %449 = fmul float %130, %446 - %450 = fadd float %449, %23 - %451 = insertelement <4 x float> undef, float %448, i32 0 - %452 = insertelement <4 x float> %451, float %450, i32 1 - %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2 - %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3 - %455 = extractelement <4 x float> %454, i32 0 - %456 = extractelement <4 x float> %454, i32 1 - %457 = insertelement <4 x float> undef, float %455, i32 0 - %458 = insertelement <4 x float> %457, float %456, i32 1 - %459 = 
insertelement <4 x float> %458, float undef, i32 2 - %460 = insertelement <4 x float> %459, float undef, i32 3 - %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2) - %462 = extractelement <4 x float> %461, i32 3 - %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0 - %464 = sext i1 %463 to i32 - %465 = bitcast i32 %464 to float - %466 = fcmp oge float %446, %462 - %467 = sext i1 %466 to i32 - %468 = bitcast i32 %467 to float - %469 = bitcast float %465 to i32 - %470 = bitcast float %468 to i32 - %471 = and i32 %469, %470 - %472 = bitcast i32 %471 to float - %473 = bitcast float %472 to i32 - %474 = icmp ne i32 %473, 0 - %.temp92.0 = select i1 %474, float %446, float %temp92.0 - %475 = bitcast float %temp96.0 to i32 - %476 = add i32 %475, 1 - %477 = bitcast i32 %476 to float - br label %LOOP - -IF146: ; preds = %IF140 - %478 = fmul float 2.000000e+00, %424 - %479 = fsub float -0.000000e+00, %478 - %480 = fadd float %temp92.0, %479 - br label %ENDIF145 - -ENDIF145: ; preds = %IF140, %IF146 - %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ] - %481 = fadd float %temp88.1, %424 - %482 = fmul float %424, 5.000000e-01 - %483 = fmul float %129, %481 - %484 = fadd float %483, %22 - %485 = fmul float %130, %481 - %486 = fadd float %485, %23 - %487 = insertelement <4 x float> undef, float %484, i32 0 - %488 = insertelement <4 x float> %487, float %486, i32 1 - %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2 - %490 = insertelement <4 x float> %489, float %440, i32 3 - %491 = extractelement <4 x float> %490, i32 0 - %492 = extractelement <4 x float> %490, i32 1 - %493 = insertelement <4 x float> undef, float %491, i32 0 - %494 = insertelement <4 x float> %493, float %492, i32 1 - %495 = insertelement <4 x float> %494, float undef, i32 2 - %496 = insertelement <4 x float> %495, float undef, i32 3 - %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2) - %498 = extractelement <4 x float> %497, i32 3 - %499 = fcmp oge float %481, %498 - %500 = sext i1 %499 to i32 - %501 = bitcast i32 %500 to float - %502 = bitcast float %501 to i32 - %503 = icmp ne i32 %502, 0 - br i1 %503, label %IF149, label %ENDIF148 - -IF149: ; preds = %ENDIF145 - %504 = fmul float 2.000000e+00, %482 - %505 = fsub float -0.000000e+00, %504 - %506 = fadd float %481, %505 - br label %ENDIF148 - -ENDIF148: ; preds = %ENDIF145, %IF149 - %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ] - %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ] - %507 = fadd float %temp88.2, %482 - %508 = fmul float %482, 5.000000e-01 - %509 = fmul float %129, %507 - %510 = fadd float %509, %22 - %511 = fmul float %130, %507 - %512 = fadd float %511, %23 - %513 = insertelement <4 x float> undef, float %510, i32 0 - %514 = insertelement <4 x float> %513, float %512, i32 1 - %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2 - %516 = insertelement <4 x float> %515, float %498, i32 3 - %517 = extractelement <4 x float> %516, i32 0 - %518 = extractelement <4 x float> %516, i32 1 - %519 = insertelement <4 x float> undef, float %517, i32 0 - %520 = insertelement <4 x float> %519, float %518, i32 1 - %521 = insertelement <4 x float> %520, float undef, i32 2 - %522 = insertelement <4 x float> %521, float undef, i32 3 - %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2) - %524 = extractelement <4 x float> %523, i32 3 - %525 = fcmp oge float %507, %524 - %526 = sext i1 %525 to i32 - %527 = bitcast i32 %526 to float - 
%528 = bitcast float %527 to i32 - %529 = icmp ne i32 %528, 0 - br i1 %529, label %IF152, label %ENDIF151 - -IF152: ; preds = %ENDIF148 - %530 = fmul float 2.000000e+00, %508 - %531 = fsub float -0.000000e+00, %530 - %532 = fadd float %507, %531 - br label %ENDIF151 - -ENDIF151: ; preds = %ENDIF148, %IF152 - %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ] - %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ] - %533 = fadd float %temp88.3, %508 - %534 = fmul float %508, 5.000000e-01 - %535 = fmul float %129, %533 - %536 = fadd float %535, %22 - %537 = fmul float %130, %533 - %538 = fadd float %537, %23 - %539 = insertelement <4 x float> undef, float %536, i32 0 - %540 = insertelement <4 x float> %539, float %538, i32 1 - %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2 - %542 = insertelement <4 x float> %541, float %524, i32 3 - %543 = extractelement <4 x float> %542, i32 0 - %544 = extractelement <4 x float> %542, i32 1 - %545 = insertelement <4 x float> undef, float %543, i32 0 - %546 = insertelement <4 x float> %545, float %544, i32 1 - %547 = insertelement <4 x float> %546, float undef, i32 2 - %548 = insertelement <4 x float> %547, float undef, i32 3 - %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2) - %550 = extractelement <4 x float> %549, i32 3 - %551 = fcmp oge float %533, %550 - %552 = sext i1 %551 to i32 - %553 = bitcast i32 %552 to float - %554 = bitcast float %553 to i32 - %555 = icmp ne i32 %554, 0 - br i1 %555, label %IF155, label %ENDIF154 - -IF155: ; preds = %ENDIF151 - %556 = fmul float 2.000000e+00, %534 - %557 = fsub float -0.000000e+00, %556 - %558 = fadd float %533, %557 - br label %ENDIF154 - -ENDIF154: ; preds = %ENDIF151, %IF155 - %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ] - %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ] - %559 = fadd float %temp88.4, %534 - %560 = fmul float %129, %559 - %561 = fadd float %560, %22 - %562 = fmul float %130, %559 - %563 = fadd float %562, %23 - %564 = insertelement <4 x float> undef, float %561, i32 0 - %565 = insertelement <4 x float> %564, float %563, i32 1 - %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2 - %567 = insertelement <4 x float> %566, float %550, i32 3 - %568 = extractelement <4 x float> %567, i32 0 - %569 = extractelement <4 x float> %567, i32 1 - %570 = insertelement <4 x float> undef, float %568, i32 0 - %571 = insertelement <4 x float> %570, float %569, i32 1 - %572 = insertelement <4 x float> %571, float undef, i32 2 - %573 = insertelement <4 x float> %572, float undef, i32 3 - %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2) - %575 = extractelement <4 x float> %574, i32 3 - %576 = fcmp oge float %559, %575 - %577 = sext i1 %576 to i32 - %578 = bitcast i32 %577 to float - %579 = bitcast float %578 to i32 - %580 = icmp ne i32 %579, 0 - %.temp92.4 = select i1 %580, float %559, float %temp92.4 - %581 = fmul float %129, %.temp92.4 - %582 = fadd float %581, %22 - %583 = fmul float %130, %.temp92.4 - %584 = fadd float %583, %23 - %585 = insertelement <4 x float> undef, float %582, i32 0 - %586 = insertelement <4 x float> %585, float %584, i32 1 - %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2 - %588 = insertelement <4 x float> %587, float %575, i32 3 - %589 = extractelement <4 x float> %588, i32 0 - %590 = extractelement <4 x float> %588, i32 1 - %591 = insertelement <4 x float> undef, float %589, i32 0 - %592 = insertelement <4 x float> %591, 
float %590, i32 1 - %593 = insertelement <4 x float> %592, float undef, i32 2 - %594 = insertelement <4 x float> %593, float undef, i32 3 - %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2) - %596 = extractelement <4 x float> %595, i32 0 - %597 = extractelement <4 x float> %595, i32 1 - %598 = extractelement <4 x float> %595, i32 2 - %599 = fmul float %596, 2.000000e+00 - %600 = fadd float %599, -1.000000e+00 - %601 = fmul float %597, 2.000000e+00 - %602 = fadd float %601, -1.000000e+00 - %603 = fmul float %598, 2.000000e+00 - %604 = fadd float %603, -1.000000e+00 - br label %ENDIF136 - -IF161: ; preds = %ENDIF136 - %605 = fmul float %202, 0x3FB99999A0000000 - %606 = fcmp uge float 0x3FE4CCCCC0000000, %605 - %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605 - %608 = fcmp uge float %607, 5.000000e-01 - %609 = select i1 %608, float 5.000000e-01, float %607 - %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300) - %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301) - %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302) - %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303) - %614 = insertelement <4 x float> undef, float %329, i32 0 - %615 = insertelement <4 x float> %614, float %330, i32 1 - %616 = insertelement <4 x float> %615, float %331, i32 2 - %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3 - %618 = insertelement <4 x float> undef, float %63, i32 0 - %619 = insertelement <4 x float> %618, float %65, i32 1 - %620 = insertelement <4 x float> %619, float %67, i32 2 - %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3 - %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621) - %623 = fcmp uge float 0x3FE6666660000000, %622 - %624 = select i1 %623, float 0x3FE6666660000000, float %622 - %625 = fmul float %8, %624 - %626 = fmul float %13, %624 - %627 = fmul float %18, %624 - %628 = insertelement <4 x float> undef, float %34, i32 0 - %629 = insertelement <4 x float> %628, float %35, i32 1 - %630 = insertelement <4 x float> %629, float %36, i32 2 - %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3 - %632 = insertelement <4 x float> undef, float %63, i32 0 - %633 = insertelement <4 x float> %632, float %65, i32 1 - %634 = insertelement <4 x float> %633, float %67, i32 2 - %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3 - %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635) - %637 = fcmp uge float 0x3FECCCCCC0000000, %636 - %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636 - %639 = fmul float %625, %638 - %640 = fmul float %626, %638 - %641 = fmul float %627, %638 - br label %ENDIF160 - -ENDIF160: ; preds = %ENDIF136, %IF161 - %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ] - %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ] - %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ] - %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ] - %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ] - %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ] - %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ] - %642 = fcmp olt float 2.200000e+03, %179 - %643 = sext i1 %642 to i32 - %644 = bitcast i32 %643 to float - %645 = fcmp olt float %179, 2.300000e+03 - %646 = sext i1 %645 to i32 - %647 = bitcast i32 %646 to float - %648 = bitcast float %644 to i32 - %649 = bitcast float %647 to i32 - %650 = and i32 %648, %649 - %651 = 
bitcast i32 %650 to float - %652 = bitcast float %651 to i32 - %653 = icmp ne i32 %652, 0 - br i1 %653, label %IF164, label %ENDIF163 - -IF164: ; preds = %ENDIF160 - %654 = fmul float %202, 5.000000e-01 - %655 = fcmp uge float 0x3FE4CCCCC0000000, %654 - %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654 - %657 = fcmp uge float %656, 0x3FD6666660000000 - %658 = select i1 %657, float 0x3FD6666660000000, float %656 - %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300) - %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301) - %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302) - %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303) - %663 = insertelement <4 x float> undef, float %329, i32 0 - %664 = insertelement <4 x float> %663, float %330, i32 1 - %665 = insertelement <4 x float> %664, float %331, i32 2 - %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3 - %667 = insertelement <4 x float> undef, float %63, i32 0 - %668 = insertelement <4 x float> %667, float %65, i32 1 - %669 = insertelement <4 x float> %668, float %67, i32 2 - %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3 - %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670) - %672 = fcmp uge float 0x3FE6666660000000, %671 - %673 = select i1 %672, float 0x3FE6666660000000, float %671 - %674 = fmul float %8, %673 - %675 = fmul float %13, %673 - %676 = fmul float %18, %673 - %677 = insertelement <4 x float> undef, float %34, i32 0 - %678 = insertelement <4 x float> %677, float %35, i32 1 - %679 = insertelement <4 x float> %678, float %36, i32 2 - %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3 - %681 = insertelement <4 x float> undef, float %63, i32 0 - %682 = insertelement <4 x float> %681, float %65, i32 1 - %683 = insertelement <4 x float> %682, float %67, i32 2 - %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3 - %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684) - %686 = fcmp uge float 0x3FECCCCCC0000000, %685 - %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685 - %688 = fmul float %674, %687 - %689 = fmul float %675, %687 - %690 = fmul float %676, %687 - br label %ENDIF163 - -ENDIF163: ; preds = %ENDIF160, %IF164 - %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ] - %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ] - %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ] - %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ] - %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ] - %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ] - %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ] - %691 = fcmp oge float %179, 2.300000e+03 - %692 = sext i1 %691 to i32 - %693 = bitcast i32 %692 to float - %694 = fcmp olt float %179, 2.480000e+03 - %695 = sext i1 %694 to i32 - %696 = bitcast i32 %695 to float - %697 = bitcast float %693 to i32 - %698 = bitcast float %696 to i32 - %699 = and i32 %697, %698 - %700 = bitcast i32 %699 to float - %701 = bitcast float %700 to i32 - %702 = icmp ne i32 %701, 0 - br i1 %702, label %IF167, label %ENDIF166 - -IF167: ; preds = %ENDIF163 - %703 = fmul float %202, 5.000000e-01 - %704 = fcmp uge float 0x3FE4CCCCC0000000, %703 - %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703 - %706 = fcmp uge float %705, 0x3FD3333340000000 - %707 = select i1 %706, float 0x3FD3333340000000, float %705 - %708 = call float 
@llvm.AMDGPU.lrp(float %707, float %409, float %300) - %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301) - %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302) - %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303) - %712 = insertelement <4 x float> undef, float %329, i32 0 - %713 = insertelement <4 x float> %712, float %330, i32 1 - %714 = insertelement <4 x float> %713, float %331, i32 2 - %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3 - %716 = insertelement <4 x float> undef, float %63, i32 0 - %717 = insertelement <4 x float> %716, float %65, i32 1 - %718 = insertelement <4 x float> %717, float %67, i32 2 - %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3 - %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719) - %721 = fcmp uge float 0x3FEB333340000000, %720 - %722 = select i1 %721, float 0x3FEB333340000000, float %720 - %723 = fmul float %8, %722 - %724 = fmul float %13, %722 - %725 = fmul float %18, %722 - %726 = insertelement <4 x float> undef, float %34, i32 0 - %727 = insertelement <4 x float> %726, float %35, i32 1 - %728 = insertelement <4 x float> %727, float %36, i32 2 - %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3 - %730 = insertelement <4 x float> undef, float %63, i32 0 - %731 = insertelement <4 x float> %730, float %65, i32 1 - %732 = insertelement <4 x float> %731, float %67, i32 2 - %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3 - %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733) - %735 = fcmp uge float 0x3FECCCCCC0000000, %734 - %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734 - %737 = fmul float %723, %736 - %738 = fmul float %724, %736 - %739 = fmul float %725, %736 - br label %ENDIF166 - -ENDIF166: ; preds = %ENDIF163, %IF167 - %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ] - %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ] - %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ] - %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ] - %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ] - %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ] - %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ] - %740 = fcmp oge float %179, 2.480000e+03 - %741 = sext i1 %740 to i32 - %742 = bitcast i32 %741 to float - %743 = fcmp olt float %179, 2.530000e+03 - %744 = sext i1 %743 to i32 - %745 = bitcast i32 %744 to float - %746 = bitcast float %742 to i32 - %747 = bitcast float %745 to i32 - %748 = and i32 %746, %747 - %749 = bitcast i32 %748 to float - %750 = bitcast float %749 to i32 - %751 = icmp ne i32 %750, 0 - br i1 %751, label %IF170, label %ENDIF169 - -IF170: ; preds = %ENDIF166 - %752 = fmul float %202, 5.000000e-01 - %753 = fcmp uge float 0x3FE4CCCCC0000000, %752 - %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752 - %755 = fcmp uge float %754, 0x3FC99999A0000000 - %756 = select i1 %755, float 0x3FC99999A0000000, float %754 - %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300) - %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301) - %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302) - %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303) - %761 = insertelement <4 x float> undef, float %329, i32 0 - %762 = insertelement <4 x float> %761, float %330, i32 1 - %763 = insertelement <4 x float> %762, float %331, i32 2 - %764 = 
insertelement <4 x float> %763, float 0.000000e+00, i32 3 - %765 = insertelement <4 x float> undef, float %63, i32 0 - %766 = insertelement <4 x float> %765, float %65, i32 1 - %767 = insertelement <4 x float> %766, float %67, i32 2 - %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3 - %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768) - %770 = fcmp uge float 0x3FEB333340000000, %769 - %771 = select i1 %770, float 0x3FEB333340000000, float %769 - %772 = fmul float %8, %771 - %773 = fmul float %13, %771 - %774 = fmul float %18, %771 - %775 = insertelement <4 x float> undef, float %34, i32 0 - %776 = insertelement <4 x float> %775, float %35, i32 1 - %777 = insertelement <4 x float> %776, float %36, i32 2 - %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3 - %779 = insertelement <4 x float> undef, float %63, i32 0 - %780 = insertelement <4 x float> %779, float %65, i32 1 - %781 = insertelement <4 x float> %780, float %67, i32 2 - %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3 - %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782) - %784 = fcmp uge float 0x3FECCCCCC0000000, %783 - %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783 - %786 = fmul float %772, %785 - %787 = fmul float %773, %785 - %788 = fmul float %774, %785 - br label %ENDIF169 - -ENDIF169: ; preds = %ENDIF166, %IF170 - %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ] - %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ] - %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ] - %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ] - %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ] - %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ] - %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ] - %789 = fcmp oge float %179, 2.530000e+03 - %790 = sext i1 %789 to i32 - %791 = bitcast i32 %790 to float - %792 = fcmp olt float %179, 2.670000e+03 - %793 = sext i1 %792 to i32 - %794 = bitcast i32 %793 to float - %795 = bitcast float %791 to i32 - %796 = bitcast float %794 to i32 - %797 = and i32 %795, %796 - %798 = bitcast i32 %797 to float - %799 = bitcast float %798 to i32 - %800 = icmp ne i32 %799, 0 - br i1 %800, label %IF173, label %ENDIF172 - -IF173: ; preds = %ENDIF169 - %801 = fmul float %202, 5.000000e-01 - %802 = fcmp uge float 0x3FE4CCCCC0000000, %801 - %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801 - %804 = fcmp uge float %803, 0x3FB99999A0000000 - %805 = select i1 %804, float 0x3FB99999A0000000, float %803 - %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300) - %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301) - %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302) - %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303) - %810 = insertelement <4 x float> undef, float %329, i32 0 - %811 = insertelement <4 x float> %810, float %330, i32 1 - %812 = insertelement <4 x float> %811, float %331, i32 2 - %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3 - %814 = insertelement <4 x float> undef, float %63, i32 0 - %815 = insertelement <4 x float> %814, float %65, i32 1 - %816 = insertelement <4 x float> %815, float %67, i32 2 - %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3 - %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817) - %819 = fcmp uge float 0x3FEB333340000000, %818 - %820 = select i1 %819, float 
0x3FEB333340000000, float %818 - %821 = fmul float %8, %820 - %822 = fmul float %13, %820 - %823 = fmul float %18, %820 - %824 = insertelement <4 x float> undef, float %34, i32 0 - %825 = insertelement <4 x float> %824, float %35, i32 1 - %826 = insertelement <4 x float> %825, float %36, i32 2 - %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3 - %828 = insertelement <4 x float> undef, float %63, i32 0 - %829 = insertelement <4 x float> %828, float %65, i32 1 - %830 = insertelement <4 x float> %829, float %67, i32 2 - %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3 - %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831) - %833 = fcmp uge float 0x3FECCCCCC0000000, %832 - %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832 - %835 = fmul float %821, %834 - %836 = fmul float %822, %834 - %837 = fmul float %823, %834 - br label %ENDIF172 - -ENDIF172: ; preds = %ENDIF169, %IF173 - %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ] - %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ] - %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ] - %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ] - %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ] - %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ] - %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ] - %838 = fcmp oge float %179, 2.670000e+03 - %839 = sext i1 %838 to i32 - %840 = bitcast i32 %839 to float - %841 = bitcast float %840 to i32 - %842 = icmp ne i32 %841, 0 - br i1 %842, label %IF176, label %ENDIF175 - -IF176: ; preds = %ENDIF172 - %843 = fmul float %202, 0x3FB99999A0000000 - %844 = fcmp uge float 0.000000e+00, %843 - %845 = select i1 %844, float 0.000000e+00, float %843 - %846 = fcmp uge float %845, 0x3FD99999A0000000 - %847 = select i1 %846, float 0x3FD99999A0000000, float %845 - %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300) - %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301) - %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302) - %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303) - %852 = insertelement <4 x float> undef, float %329, i32 0 - %853 = insertelement <4 x float> %852, float %330, i32 1 - %854 = insertelement <4 x float> %853, float %331, i32 2 - %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3 - %856 = insertelement <4 x float> undef, float %63, i32 0 - %857 = insertelement <4 x float> %856, float %65, i32 1 - %858 = insertelement <4 x float> %857, float %67, i32 2 - %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3 - %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859) - %861 = fcmp uge float 0x3FEB333340000000, %860 - %862 = select i1 %861, float 0x3FEB333340000000, float %860 - %863 = fmul float %8, %862 - %864 = fmul float %13, %862 - %865 = fmul float %18, %862 - %866 = insertelement <4 x float> undef, float %34, i32 0 - %867 = insertelement <4 x float> %866, float %35, i32 1 - %868 = insertelement <4 x float> %867, float %36, i32 2 - %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3 - %870 = insertelement <4 x float> undef, float %63, i32 0 - %871 = insertelement <4 x float> %870, float %65, i32 1 - %872 = insertelement <4 x float> %871, float %67, i32 2 - %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3 - %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873) - %875 = fcmp uge float 
0x3FECCCCCC0000000, %874 - %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874 - %877 = fmul float %863, %876 - %878 = fmul float %864, %876 - %879 = fmul float %865, %876 - br label %ENDIF175 - -ENDIF175: ; preds = %ENDIF172, %IF176 - %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ] - %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ] - %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ] - %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ] - %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ] - %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ] - %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ] - %880 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %881 = extractelement <4 x float> %880, i32 0 - %882 = fcmp olt float %881, %179 - %883 = sext i1 %882 to i32 - %884 = bitcast i32 %883 to float - %885 = bitcast float %884 to i32 - %886 = icmp ne i32 %885, 0 - br i1 %886, label %IF179, label %ENDIF178 - -IF179: ; preds = %ENDIF175 - %887 = fadd float %202, 1.000000e+00 - %888 = fadd float %202, 1.000000e+00 - %889 = fadd float %202, 1.000000e+00 - %890 = insertelement <4 x float> undef, float %43, i32 0 - %891 = insertelement <4 x float> %890, float %44, i32 1 - %892 = insertelement <4 x float> %891, float %45, i32 2 - %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3 - %894 = insertelement <4 x float> undef, float %43, i32 0 - %895 = insertelement <4 x float> %894, float %44, i32 1 - %896 = insertelement <4 x float> %895, float %45, i32 2 - %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3 - %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897) - %899 = call float @llvm.AMDGPU.rsq.f32(float %898) - %900 = fmul float %45, %899 - %901 = call float @fabs(float %900) - %902 = fmul float %176, 0x3FECCCCCC0000000 - %903 = fadd float %902, %901 - %904 = fadd float %903, 0xBFEFAE1480000000 - %905 = fmul float %904, 0xC043FFFE20000000 - %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00) - %907 = fmul float 2.000000e+00, %906 - %908 = fsub float -0.000000e+00, %907 - %909 = fadd float 3.000000e+00, %908 - %910 = fmul float %906, %909 - %911 = fmul float %906, %910 - %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887) - %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888) - %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889) - %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00) - %916 = fmul float %202, 5.000000e-01 - %917 = fcmp uge float 0x3FE4CCCCC0000000, %916 - %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916 - %919 = fcmp uge float %918, 0x3FE3333340000000 - %920 = select i1 %919, float 0x3FE3333340000000, float %918 - %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5) - %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5) - %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5) - %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5) - %925 = insertelement <4 x float> undef, float %329, i32 0 - %926 = insertelement <4 x float> %925, float %330, i32 1 - %927 = insertelement <4 x float> %926, float %331, i32 2 - %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3 - %929 = insertelement <4 x float> 
undef, float %63, i32 0 - %930 = insertelement <4 x float> %929, float %65, i32 1 - %931 = insertelement <4 x float> %930, float %67, i32 2 - %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3 - %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932) - %934 = fcmp uge float 0x3FE99999A0000000, %933 - %935 = select i1 %934, float 0x3FE99999A0000000, float %933 - %936 = fmul float %8, %935 - %937 = fmul float %13, %935 - %938 = fmul float %18, %935 - %939 = insertelement <4 x float> undef, float %34, i32 0 - %940 = insertelement <4 x float> %939, float %35, i32 1 - %941 = insertelement <4 x float> %940, float %36, i32 2 - %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3 - %943 = insertelement <4 x float> undef, float %63, i32 0 - %944 = insertelement <4 x float> %943, float %65, i32 1 - %945 = insertelement <4 x float> %944, float %67, i32 2 - %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3 - %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946) - %948 = fcmp uge float 0x3FECCCCCC0000000, %947 - %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947 - %950 = fmul float %936, %949 - %951 = fmul float %937, %949 - %952 = fmul float %938, %949 - br label %ENDIF178 - -ENDIF178: ; preds = %ENDIF175, %IF179 - %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ] - %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ] - %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ] - %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ] - %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ] - %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ] - %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ] - %953 = fmul float %55, %temp92.12 - %954 = fmul float %57, %temp93.6 - %955 = fmul float %59, %temp94.6 - %956 = fmul float %61, 0.000000e+00 - %957 = fmul float %temp84.6, %953 - %958 = fmul float %temp85.6, %954 - %959 = fmul float %temp86.6, %955 - %960 = fmul float %temp87.6, %956 - %961 = fmul float %2, -2.000000e+00 - %962 = fadd float %961, 1.000000e+00 - %963 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23) - %964 = extractelement <4 x float> %963, i32 2 - %965 = fsub float -0.000000e+00, %964 - %966 = fadd float %962, %965 - %967 = fdiv float 1.000000e+00, %966 - %968 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24) - %969 = extractelement <4 x float> %968, i32 2 - %970 = fmul float %969, %967 - %971 = fsub float -0.000000e+00, %53 - %972 = fmul float %971, %53 - %973 = fmul float %972, %970 - %974 = fmul float %973, %970 - %975 = fmul float %974, 0x3FF7154760000000 - %976 = call float @llvm.AMDIL.exp.(float %975) - %977 = fcmp oeq float %53, 1.000000e+00 - %978 = sext i1 %977 to i32 - %979 = bitcast i32 %978 to float - %980 = bitcast float %979 to i32 - %981 = icmp ne i32 %980, 0 - %.184 = select i1 %981, float 1.000000e+00, float %976 - %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47) - %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49) - %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51) - %985 = insertelement <4 x float> undef, float %982, i32 0 - %986 = insertelement <4 x float> %985, float %983, i32 1 - %987 = insertelement <4 x float> %986, float %984, i32 2 - %988 = insertelement <4 x float> 
%987, float %960, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } -attributes #2 = { readonly } diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll deleted file mode 100644 index fd56d956bf3..00000000000 --- a/test/CodeGen/R600/bitcast.ll +++ /dev/null @@ -1,79 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; This test just checks that the compiler doesn't crash. - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -; FUNC-LABEL: {{^}}v32i8_to_v8i32: -; SI: s_endpgm -define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { -entry: - %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 - %2 = bitcast <32 x i8> %1 to <8 x i32> - %3 = extractelement <8 x i32> %2, i32 1 - %4 = icmp ne i32 %3, 0 - %5 = select i1 %4, float 0.0, float 1.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void -} - -; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: -; SI: s_endpgm -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* - %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 - store <16 x i8> %1, <16 x i8> addrspace(1)* %out - ret void -} - -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %load = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %load to <2 x i16> - store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 - ret void -} - -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 - %bc = bitcast <2 x i16> %load to float - store float %bc, float addrspace(1)* %out, align 4 - ret void -} - -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %bc = bitcast <4 x i8> %load to i32 - store i32 %bc, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %bc = bitcast i32 %load to <4 x i8> - store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64: -; SI: s_endpgm -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %add = add <2 x i32> %val, - %bc = bitcast <2 x i32> %add to double - store double %bc, double addrspace(1)* %out, 
align 8 - ret void -} - -; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: -; SI: s_endpgm -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { - %val = load double, double addrspace(1)* %in, align 8 - %add = fadd double %val, 4.0 - %bc = bitcast double %add to <2 x i32> - store <2 x i32> %bc, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll deleted file mode 100644 index 4cf8e4bfed5..00000000000 --- a/test/CodeGen/R600/bswap.ll +++ /dev/null @@ -1,115 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.bswap.i32(i32) nounwind readnone -declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone -declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone -declare i64 @llvm.bswap.i64(i64) nounwind readnone -declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone -declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone - -; FUNC-LABEL: @test_bswap_i32 -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8 -; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24 -; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff -; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone - store i32 %bswap, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_bswap_v2i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone - store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_bswap_v4i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 - %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone - store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_bswap_v8i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: 
v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_bfi_b32 -; SI: s_endpgm -define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { - %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 - %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone - store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone - store i64 %bswap, i64 addrspace(1)* %out, align 8 - ret void -} - -define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone - store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 - ret void -} - -define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 - %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone - store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll deleted file mode 100644 index 65eacf5adc4..00000000000 --- a/test/CodeGen/R600/build_vector.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI - -; R600: {{^}}build_vector2: -; R600: MOV -; R600: MOV -; R600-NOT: MOV -; SI: {{^}}build_vector2: -; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 -; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} -define void @build_vector2 (<2 x i32> addrspace(1)* %out) { -entry: - store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out - ret void -} - -; R600: {{^}}build_vector4: -; R600: MOV -; R600: MOV -; R600: MOV -; R600: MOV -; R600-NOT: MOV -; SI: {{^}}build_vector4: -; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 -; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 -; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 -; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 -; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} -define void @build_vector4 (<4 x i32> addrspace(1)* %out) { -entry: - store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll deleted file mode 100644 index e769fd11c28..00000000000 --- a/test/CodeGen/R600/call.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s 2>&1 | FileCheck %s -; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported call to function external_function in test_call_external - - -declare i32 @external_function(i32) nounwind - -define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %c = call i32 
@external_function(i32 %b) nounwind - %result = add i32 %a, %c - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define i32 @defined_function(i32 %x) nounwind noinline { - %y = add i32 %x, 8 - ret i32 %y -} - -define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %c = call i32 @defined_function(i32 %b) nounwind - %result = add i32 %a, %c - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll deleted file mode 100644 index 87bebbc49d5..00000000000 --- a/test/CodeGen/R600/call_fs.ll +++ /dev/null @@ -1,17 +0,0 @@ - -; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600 %s - -; EG: .long 257 -; EG: {{^}}call_fs: -; EG: CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] -; R600: .long 257 -; R600: {{^}}call_fs: -; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] - - -define void @call_fs() #0 { - ret void -} - -attributes #0 = { "ShaderType"="1" } ; Vertex Shader diff --git a/test/CodeGen/R600/cayman-loop-bug.ll b/test/CodeGen/R600/cayman-loop-bug.ll deleted file mode 100644 index c7b8c403731..00000000000 --- a/test/CodeGen/R600/cayman-loop-bug.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: LOOP_START_DX10 -; CHECK: ALU_PUSH_BEFORE -; CHECK: LOOP_START_DX10 -; CHECK: PUSH -; CHECK-NOT: ALU_PUSH_BEFORE -; CHECK: END_LOOP -; CHECK: END_LOOP -define void @main (<4 x float> inreg %reg0) #0 { -entry: - br label %outer_loop -outer_loop: - %cnt = phi i32 [0, %entry], [%cnt_incr, %inner_loop] - %cond = icmp eq i32 %cnt, 16 - br i1 %cond, label %outer_loop_body, label %exit -outer_loop_body: - %cnt_incr = add i32 %cnt, 1 - br label %inner_loop -inner_loop: - %cnt2 = phi i32 [0, %outer_loop_body], [%cnt2_incr, %inner_loop_body] - %cond2 = icmp eq i32 %cnt2, 16 - br i1 %cond, label %inner_loop_body, label %outer_loop -inner_loop_body: - %cnt2_incr = add i32 %cnt2, 1 - br label %inner_loop -exit: - ret void -} - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll deleted file mode 100644 index 75b87e48622..00000000000 --- a/test/CodeGen/R600/cf-stack-bug.ll +++ /dev/null @@ -1,244 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG64 %s < %t - -; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=BUG32 %s < %t - -; RUN: llc -march=r600 
-mcpu=juniper -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>%t | FileCheck %s --check-prefix=FUNC -; RUN: FileCheck --check-prefix=NOBUG %s < %t - -; REQUIRES: asserts - -; We are currently allocating 2 extra sub-entries on Evergreen / NI for -; non-WQM push instructions if we change this to 1, then we will need to -; add one level of depth to each of these tests. - -; BUG64-NOT: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested3: -define void @nested3(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.store.1 - -if.store.1: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - store i32 3, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested4: -define void @nested4(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - store i32 4, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32-NOT: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested7: -define void @nested7(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - %4 = icmp sgt i32 %cond, 40 - br i1 %4, label %if.5, label %if.4.store - -if.4.store: - store i32 4, i32 addrspace(1)* %out - br label %end - -if.5: - %5 = icmp sgt i32 %cond, 50 - br i1 %5, label %if.6, label %if.5.store - -if.5.store: - store i32 5, i32 addrspace(1)* %out - br label %end - -if.6: - %6 = icmp sgt i32 %cond, 60 - br i1 %6, label %if.7, label %if.6.store - -if.6.store: - store i32 6, i32 addrspace(1)* %out - br label %end - -if.7: - store i32 7, i32 addrspace(1)* %out - br label %end - -end: - ret void -} - -; BUG64: Applying bug work-around -; BUG32: Applying bug work-around -; NOBUG-NOT: Applying bug work-around -; FUNC-LABEL: {{^}}nested8: -define 
void @nested8(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp sgt i32 %cond, 0 - br i1 %0, label %if.1, label %end - -if.1: - %1 = icmp sgt i32 %cond, 10 - br i1 %1, label %if.2, label %if.1.store - -if.1.store: - store i32 1, i32 addrspace(1)* %out - br label %end - -if.2: - %2 = icmp sgt i32 %cond, 20 - br i1 %2, label %if.3, label %if.2.store - -if.2.store: - store i32 2, i32 addrspace(1)* %out - br label %end - -if.3: - %3 = icmp sgt i32 %cond, 30 - br i1 %3, label %if.4, label %if.3.store - -if.3.store: - store i32 3, i32 addrspace(1)* %out - br label %end - -if.4: - %4 = icmp sgt i32 %cond, 40 - br i1 %4, label %if.5, label %if.4.store - -if.4.store: - store i32 4, i32 addrspace(1)* %out - br label %end - -if.5: - %5 = icmp sgt i32 %cond, 50 - br i1 %5, label %if.6, label %if.5.store - -if.5.store: - store i32 5, i32 addrspace(1)* %out - br label %end - -if.6: - %6 = icmp sgt i32 %cond, 60 - br i1 %6, label %if.7, label %if.6.store - -if.6.store: - store i32 6, i32 addrspace(1)* %out - br label %end - -if.7: - %7 = icmp sgt i32 %cond, 70 - br i1 %7, label %if.8, label %if.7.store - -if.7.store: - store i32 7, i32 addrspace(1)* %out - br label %end - -if.8: - store i32 8, i32 addrspace(1)* %out - br label %end - -end: - ret void -} diff --git a/test/CodeGen/R600/cf_end.ll b/test/CodeGen/R600/cf_end.ll deleted file mode 100644 index c74ee22868d..00000000000 --- a/test/CodeGen/R600/cf_end.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood --show-mc-encoding | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=caicos --show-mc-encoding | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -mcpu=cayman --show-mc-encoding | FileCheck --check-prefix=CM %s - -; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] -; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] -define void @eop() { - ret void -} diff --git a/test/CodeGen/R600/cgp-addressing-modes.ll b/test/CodeGen/R600/cgp-addressing-modes.ll deleted file mode 100644 index 77f7bd01b7f..00000000000 --- a/test/CodeGen/R600/cgp-addressing-modes.ll +++ /dev/null @@ -1,242 +0,0 @@ -; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s - -declare i32 @llvm.r600.read.tidig.x() #0 - -; OPT-LABEL: @test_sink_global_small_offset_i32( -; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in -; OPT: br i1 -; OPT: ptrtoint - -; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: -; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(1)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( -; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 -; OPT: br i1 - -; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: {{^}}BB1_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* 
%out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} -; GCN: {{^}}BB2_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} -; GCN: {{^}}BB3_2: -; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 - %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i8, i8 addrspace(1)* %in.gep - %tmp2 = sext i8 %tmp1 to i32 - br label %endif - -endif: - %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_no_sink_flat_small_offset_i32( -; OPT: getelementptr i32, i32 addrspace(4)* %in -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: -; GCN: flat_load_dword -; GCN: {{^}}BB4_2: - -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { -entry: - %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(4)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(4)* %out.gep - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_sink_scratch_small_offset_i32( -; OPT-NOT: getelementptr [512 x i32] -; OPT: br i1 -; OPT: ptrtoint - -; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} -; GCN: {{^}}BB5_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { -entry: - %alloca = alloca [512 x i32], align 4 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - 
%add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep - store i32 %load, i32 addrspace(1)* %out.gep.1 - br label %done - -done: - ret void -} - -; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( -; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 -; OPT: br i1 -; OPT-NOT: ptrtoint - -; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: {{^}}BB6_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { -entry: - %alloca = alloca [512 x i32], align 4 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %add.arg = add i32 %arg, 8 - %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - store volatile i32 123, i32* %alloca.gep - %tmp1 = load volatile i32, i32* %alloca.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep.0 - %load = load volatile i32, i32* %alloca.gep - store i32 %load, i32 addrspace(1)* %out.gep.1 - br label %done - -done: - ret void -} - -; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: -; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: {{^}}BB7_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { -entry: - %offset.ext = zext i32 %offset to i64 - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 - %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %endif, label %if - -if: - %tmp1 = load i32, i32 addrspace(1)* %in.gep - br label %endif - -endif: - %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] - store i32 %x, i32 addrspace(1)* %out.gep - br label %done - -done: - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/coalescer_remat.ll b/test/CodeGen/R600/coalescer_remat.ll deleted file mode 100644 index 96730bcf2e8..00000000000 --- a/test/CodeGen/R600/coalescer_remat.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s - -declare float @llvm.fma.f32(float, float, float) - -; This checks that rematerialization support of the coalescer does not -; unnecessarily widen the register class. Without those fixes > 20 VGprs -; are used here -; Also check that some rematerialization of the 0 constant happened. 
-; CHECK-LABEL: foobar -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 9 -define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { -entry: - %cmpflag = icmp eq i32 %flag, 1 - br i1 %cmpflag, label %loop, label %exit - -loop: - %c = phi i32 [0, %entry], [%cnext, %loop] - %v0 = phi float [0.0, %entry], [%fma.0, %loop] - %v1 = phi float [0.0, %entry], [%fma.1, %loop] - %v2 = phi float [0.0, %entry], [%fma.2, %loop] - %v3 = phi float [0.0, %entry], [%fma.3, %loop] - - ; Try to get the 0 constant to get coalesced into a wide register - %blup = insertelement <4 x float> undef, float %v0, i32 0 - store <4 x float> %blup, <4 x float> addrspace(1)* %out - - %load = load <4 x float>, <4 x float> addrspace(1)* %in - %load.0 = extractelement <4 x float> %load, i32 0 - %load.1 = extractelement <4 x float> %load, i32 1 - %load.2 = extractelement <4 x float> %load, i32 2 - %load.3 = extractelement <4 x float> %load, i32 3 - %fma.0 = call float @llvm.fma.f32(float %v0, float %load.0, float %v0) - %fma.1 = call float @llvm.fma.f32(float %v1, float %load.1, float %v1) - %fma.2 = call float @llvm.fma.f32(float %v2, float %load.2, float %v2) - %fma.3 = call float @llvm.fma.f32(float %v3, float %load.3, float %v3) - - %cnext = add nsw i32 %c, 1 - %cmp = icmp eq i32 %cnext, 42 - br i1 %cmp, label %exit, label %loop - -exit: - %ev0 = phi float [0.0, %entry], [%fma.0, %loop] - %ev1 = phi float [0.0, %entry], [%fma.1, %loop] - %ev2 = phi float [0.0, %entry], [%fma.2, %loop] - %ev3 = phi float [0.0, %entry], [%fma.3, %loop] - %dst.0 = insertelement <4 x float> undef, float %ev0, i32 0 - %dst.1 = insertelement <4 x float> %dst.0, float %ev1, i32 1 - %dst.2 = insertelement <4 x float> %dst.1, float %ev2, i32 2 - %dst.3 = insertelement <4 x float> %dst.2, float %ev3, i32 3 - store <4 x float> %dst.3, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll deleted file mode 100644 index 58517209267..00000000000 --- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s - -; OPT-LABEL: @test( -; OPT: mul nsw i32 -; OPT-NEXT: sext - -; SI-LLC-LABEL: {{^}}test: -; SI-LLC: s_mul_i32 -; SI-LLC-NOT: mul -define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { -entry: - %0 = mul nsw i32 %a, 3 - %1 = sext i32 %0 to i64 - %2 = getelementptr i8, i8 addrspace(1)* %in, i64 %1 - store i8 %b, i8 addrspace(1)* %2 - ret void -} diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll deleted file mode 100644 index 01572afa620..00000000000 --- a/test/CodeGen/R600/combine_vloads.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -; -; kernel void combine_vloads(global char8* src, global char8* result) { -; for (int i = 0; i < 1024; ++i) -; result[i] = src[0] + src[1] + src[2] + src[3]; -; } -; - - -; 128-bit loads instead of many 8-bit -; EG-LABEL: {{^}}combine_vloads: -; EG: VTX_READ_128 -; EG: VTX_READ_128 -define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* 
nocapture %result) nounwind { -entry: - br label %for.body - -for.exit: ; preds = %for.body - ret void - -for.body: ; preds = %for.body, %entry - %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ] - %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)* - %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)* - %vecload2 = load <8 x i32>, <8 x i32> addrspace(1)* %0, align 32 - %1 = bitcast <8 x i32> %vecload2 to <32 x i8> - %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - %tmp9 = add nsw <8 x i8> %tmp5, %tmp8 - %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> - %tmp13 = add nsw <8 x i8> %tmp9, %tmp12 - %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - %tmp17 = add nsw <8 x i8> %tmp13, %tmp16 - %scevgep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %result, i32 %i.01 - %2 = bitcast <8 x i8> %tmp17 to <2 x i32> - %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)* - store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8 - %tmp19 = add nsw i32 %i.01, 1 - %exitcond = icmp eq i32 %tmp19, 1024 - br i1 %exitcond, label %for.exit, label %for.body -} diff --git a/test/CodeGen/R600/commute-compares.ll b/test/CodeGen/R600/commute-compares.ll deleted file mode 100644 index 31766047a35..00000000000 --- a/test/CodeGen/R600/commute-compares.ll +++ /dev/null @@ -1,697 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #0 - -; -------------------------------------------------------------------------------- -; i32 compares -; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_eq_64_i32: -; GCN: v_cmp_eq_i32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp eq i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ne_64_i32: -; GCN: v_cmp_ne_i32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ne i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Why isn't this being folded as a constant?
-; GCN-LABEL: {{^}}commute_ne_litk_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ne i32 %val, 12345 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_64_i32: -; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ugt i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_64_i32: -; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} -define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp uge i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_64_i32: -; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ult i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_63_i32: -; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ule i32 %val, 63 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm - -; GCN-LABEL: {{^}}commute_ule_64_i32: -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp ule i32 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sgt_neg1_i32: -; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} -define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = 
getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp sgt i32 %val, -1 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sge_neg2_i32: -; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} -define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp sge i32 %val, -2 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_slt_neg16_i32: -; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} -define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp slt i32 %val, -16 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sle_5_i32: -; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} -define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i32, i32 addrspace(1)* %gep.in - %cmp = icmp sle i32 %val, 5 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; -------------------------------------------------------------------------------- -; i64 compares -; -------------------------------------------------------------------------------- - -; GCN-LABEL: {{^}}commute_eq_64_i64: -; GCN: v_cmp_eq_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp eq i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ne_64_i64: -; GCN: v_cmp_ne_i64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ne i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_64_i64: -; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ugt i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_64_i64: -; GCN: 
v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp uge i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_64_i64: -; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ult i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_63_i64: -; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ule i64 %val, 63 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm - -; GCN-LABEL: {{^}}commute_ule_64_i64: -; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} -; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp ule i64 %val, 64 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sgt_neg1_i64: -; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp sgt i64 %val, -1 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sge_neg2_i64: -; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp sge i64 %val, -2 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_slt_neg16_i64: -; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - 
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp slt i64 %val, -16 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_sle_5_i64: -; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load i64, i64 addrspace(1)* %gep.in - %cmp = icmp sle i64 %val, 5 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; -------------------------------------------------------------------------------- -; f32 compares -; -------------------------------------------------------------------------------- - - -; GCN-LABEL: {{^}}commute_oeq_2.0_f32: -; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp oeq float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - - -; GCN-LABEL: {{^}}commute_ogt_2.0_f32: -; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ogt float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_oge_2.0_f32: -; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp oge float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_olt_2.0_f32: -; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp olt float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ole_2.0_f32: -; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ole float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* 
%gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_one_2.0_f32: -; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp one float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ord_2.0_f32: -; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ord float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ueq_2.0_f32: -; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ueq float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_2.0_f32: -; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ugt float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_2.0_f32: -; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp uge float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_2.0_f32: -; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ult float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_2.0_f32: -; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - 
%val = load float, float addrspace(1)* %gep.in - %cmp = fcmp ule float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_une_2.0_f32: -; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp une float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uno_2.0_f32: -; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load float, float addrspace(1)* %gep.in - %cmp = fcmp uno float %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; -------------------------------------------------------------------------------- -; f64 compares -; -------------------------------------------------------------------------------- - - -; GCN-LABEL: {{^}}commute_oeq_2.0_f64: -; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp oeq double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - - -; GCN-LABEL: {{^}}commute_ogt_2.0_f64: -; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ogt double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_oge_2.0_f64: -; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp oge double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_olt_2.0_f64: -; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp olt double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 
%ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ole_2.0_f64: -; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ole double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_one_2.0_f64: -; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp one double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ord_2.0_f64: -; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ord double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ueq_2.0_f64: -; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ueq double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ugt_2.0_f64: -; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ugt double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uge_2.0_f64: -; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp uge double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ult_2.0_f64: -; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() 
#0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ult double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_ule_2.0_f64: -; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp ule double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_une_2.0_f64: -; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp une double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -; GCN-LABEL: {{^}}commute_uno_2.0_f64: -; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %val = load double, double addrspace(1)* %gep.in - %cmp = fcmp uno double %val, 2.0 - %ext = sext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll deleted file mode 100644 index 7fc36eabb78..00000000000 --- a/test/CodeGen/R600/commute_modifiers.ll +++ /dev/null @@ -1,181 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.fma.f32(float, float, float) nounwind readnone - -; FUNC-LABEL: @commute_add_imm_fabs_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %z = fadd float 2.0, %x.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr 
float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs - %z = fmul float 4.0, %x.fneg.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_imm_fneg_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fneg = fsub float -0.000000e+00, %x - %z = fmul float 4.0, %x.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; FIXME: Should use SGPR for literal. -; FUNC-LABEL: @commute_add_lit_fabs_f32 -; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 -; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]] -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %x = load float, float addrspace(1)* %gep.0 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %z = fadd float 1024.0, %x.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_add_fabs_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %z = fadd float %x, %y.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_fneg_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %y.fneg = fsub float -0.000000e+00, %y - %z = fmul float %x, %y.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_fabs_fneg_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 
offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs - %z = fmul float %x, %y.fabs.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; There's no reason to commute this. -; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %z = fmul float %x.fabs, %y.fabs - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32 -; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| -; SI-NEXT: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %y.fabs = call float @llvm.fabs.f32(float %y) #1 - %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs - %z = fmul float %x.fabs, %y.fabs.fneg - store float %z, float addrspace(1)* %out - ret void -} - -; Make sure we commute the multiply part for the constant in src0 even -; though we have negate modifier on src2. 
- -; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32 -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]| -; SI: buffer_store_dword [[RESULT]] -define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r2.fabs = call float @llvm.fabs.f32(float %r2) - - %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/complex-folding.ll b/test/CodeGen/R600/complex-folding.ll deleted file mode 100644 index a5399a71324..00000000000 --- a/test/CodeGen/R600/complex-folding.ll +++ /dev/null @@ -1,19 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main: -; CHECK-NOT: MOV -define void @main(<4 x float> inreg %reg0) #0 { -entry: - %0 = extractelement <4 x float> %reg0, i32 0 - %1 = call float @fabs(float %0) - %2 = fptoui float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = insertelement <4 x float> undef, float %3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0) - ret void -} - -declare float @fabs(float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll deleted file mode 100644 index a09ed1f7385..00000000000 --- a/test/CodeGen/R600/concat_vectors.ll +++ /dev/null @@ -1,296 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_concat_v1i32: -; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF -; instructions that access scratch memory. Bit 23, which is the add_tid_enable -; bit, is only set for scratch access, so we can check for the absence of this -; value if we want to ensure scratch memory is not being used. 
-; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { - %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> - store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> - store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> - store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { - %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> - store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { - %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> - store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { - %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> - store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> - store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> - store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> - store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16f32: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> - store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: 
{{^}}test_concat_v1i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { - %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> - store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> - store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> - store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> - store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> - store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { - %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> - store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> - store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> - store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> - store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16f64: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %concat = shufflevector <16 x 
double> %a, <16 x double> %b, <32 x i32> - store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { - %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> - store <2 x i1> %concat, <2 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { - %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> - store <4 x i1> %concat, <4 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { - %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> - store <8 x i1> %concat, <8 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { - %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> - store <16 x i1> %concat, <16 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { - %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> - store <32 x i1> %concat, <32 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v32i1: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { - %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> - store <64 x i1> %concat, <64 x i1> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v1i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { - %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> - store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v2i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { - %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> - store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v4i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { - %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> - store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v8i16: -; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { - %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> - store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}test_concat_v16i16: -; SI-NOT: 
s_mov_b32 s{{[0-9]}}, 0x80f000 -; SI-NOT: movrel -define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { - %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> - store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: {{^}}concat_vector_crash: -; SI: s_endpgm -define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { -bb: - %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 - %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> - %tmp2 = shufflevector <8 x float> undef, <8 x float> %tmp1, <8 x i32> - store <8 x float> %tmp2, <8 x float> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll deleted file mode 100644 index 8b397566066..00000000000 --- a/test/CodeGen/R600/copy-illegal-type.ll +++ /dev/null @@ -1,167 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_copy_v4i8: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x2: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x3: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x4: -; SI: buffer_load_dword [[REG:v[0-9]+]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: buffer_store_dword [[REG]] -; SI: s_endpgm -define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; 
SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte - -; After scalarizing v4i8 loads is fixed. -; XSI: buffer_load_dword -; XSI: V_BFE -; XSI: V_ADD -; XSI: V_ADD -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI: buffer_store_dword - -; SI: s_endpgm -define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %add = add <4 x i8> %val, - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI_DAG: buffer_store_byte - -; XSI: buffer_load_dword -; XSI: BFE -; XSI: buffer_store_dword -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI-NEXT: buffer_store_dword - -; SI: s_endpgm -define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - %add = add <4 x i8> %val, - store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 - store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v3i8: -; SI-NOT: bfe -; SI-NOT: bfi -; SI: s_endpgm -define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { - %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 - store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: s_endpgm -define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm -define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { - %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 - store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/copy-to-reg.ll b/test/CodeGen/R600/copy-to-reg.ll deleted file mode 100644 index fc875f6ef7a..00000000000 --- a/test/CodeGen/R600/copy-to-reg.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca 
-verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s - -; Test that CopyToReg instructions don't have non-register operands prior -; to being emitted. - -; Make sure this doesn't crash -; CHECK-LABEL: {{^}}copy_to_reg_frameindex: -define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %alloca = alloca [16 x i32] - br label %loop - -loop: - %inc = phi i32 [0, %entry], [%inc.i, %loop] - %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc - store i32 %inc, i32* %ptr - %inc.i = add i32 %inc, 1 - %cnd = icmp uge i32 %inc.i, 16 - br i1 %cnd, label %done, label %loop - -done: - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 - %tmp1 = load i32, i32* %tmp0 - store i32 %tmp1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll deleted file mode 100644 index bd26c302fe5..00000000000 --- a/test/CodeGen/R600/ctlz_zero_undef.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone -declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone -declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] -define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone - store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 - %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone - store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll deleted file mode 100644 index 0a031c5e24d..00000000000 --- a/test/CodeGen/R600/ctpop.ll +++ /dev/null @@ -1,300 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone -declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone -declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone - -; FUNC-LABEL: {{^}}s_ctpop_i32: -; GCN: s_load_dword [[SVAL:s[0-9]+]], -; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - store i32 %ctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; XXX - Why 0 in register? 
-; FUNC-LABEL: {{^}}v_ctpop_i32: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - store i32 %ctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: -; GCN: buffer_load_dword [[VAL1:v[0-9]+]], -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %val1 = load i32, i32 addrspace(1)* %in1, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone - %add = add i32 %ctpop0, %ctpop1 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: -; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN-NEXT: s_waitcnt -; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} -; GCN-NEXT: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { - %val0 = load i32, i32 addrspace(1)* %in0, align 4 - %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone - %add = add i32 %ctpop0, %sval - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v2i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 - %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone - store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v4i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 - %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone - store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v8i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { - 
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 - %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone - store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v16i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: s_endpgm - -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -; EG: BCNT_INT -define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { - %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 - %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone - store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %ctpop, 4 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 4, %ctpop - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f -; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone - %add = add i32 %ctpop, 99999 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: -; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], -; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BCNT_INT -define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ctpop = call i32 @llvm.ctpop.i32(i32 %val) 
nounwind readnone
-  %add = add i32 %ctpop, %const
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
-; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-
-; EG: BCNT_INT
-define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
-  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %add = add i32 %const, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
-; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
-; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-
-; EG: BCNT_INT
-define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %in, align 4
-  %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
-  %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
-  %const = load i32, i32 addrspace(1)* %gep, align 4
-  %add = add i32 %const, %ctpop
-  store i32 %add, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FIXME: We currently disallow SALU instructions in all branches,
-; but there are some cases when they should be allowed.
-
-; FUNC-LABEL: {{^}}ctpop_i32_in_br:
-; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
-; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
-entry:
-  %tmp0 = icmp eq i32 %cond, 0
-  br i1 %tmp0, label %if, label %else
-
-if:
-  %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
-  br label %endif
-
-else:
-  %tmp3 = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %tmp4 = load i32, i32 addrspace(1)* %tmp3
-  br label %endif
-
-endif:
-  %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
-  store i32 %tmp5, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
deleted file mode 100644
index e1a0ee3ea21..00000000000
--- a/test/CodeGen/R600/ctpop64.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
-
-declare i64 @llvm.ctpop.i64(i64) nounwind readnone
-declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
-declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
-declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
-declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
-
-; FUNC-LABEL: {{^}}s_ctpop_i64:
-; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; GCN:
s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]] -; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; GCN: buffer_store_dword [[VRESULT]], -; GCN: s_endpgm -define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { - %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone - %truncctpop = trunc i64 %ctpop to i32 - store i32 %truncctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 -; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone - %truncctpop = trunc i64 %ctpop to i32 - store i32 %truncctpop, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_ctpop_v2i64: -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_endpgm -define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { - %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone - %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> - store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_ctpop_v4i64: -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_bcnt1_i32_b64 -; GCN: s_endpgm -define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { - %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone - %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> - store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v2i64: -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: s_endpgm -define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone - %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> - store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_ctpop_v4i64: -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: v_bcnt_u32_b32 -; GCN: s_endpgm -define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { - %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 - %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone - %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> - store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: We currently disallow SALU instructions in all branches, -; but there are some cases when the should be allowed. 
- -; FUNC-LABEL: {{^}}ctpop_i64_in_br: -; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd -; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34 -; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}} -; GCN: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]] -; GCN: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]] -; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} -; GCN: s_endpgm -define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %else - -if: - %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg) - br label %endif - -else: - %tmp3 = getelementptr i64, i64 addrspace(1)* %in, i32 1 - %tmp4 = load i64, i64 addrspace(1)* %tmp3 - br label %endif - -endif: - %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else] - store i64 %tmp5, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll deleted file mode 100644 index 56fcb51fe14..00000000000 --- a/test/CodeGen/R600/cttz_zero_undef.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone -declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone -declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone - -; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone - store i32 %cttz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone - store i32 %cttz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone - store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: v_ffbl_b32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { - %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 - %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone - store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll deleted file mode 100644 index 3399d9da29e..00000000000 --- a/test/CodeGen/R600/cvt_f32_ubyte.ll +++ /dev/null @@ -1,196 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}load_i8_to_f32: -; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] -; SI: buffer_store_dword [[CONV]], -define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 - %cvt = uitofp i8 %load to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI-NOT: and -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 - %cvt = uitofp <2 x i8> %load to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI-NOT: bfe -; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 -; SI-DAG: v_cvt_f32_ubyte0_e32 -; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 - %cvt = uitofp <3 x i8> %load to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v4i8_to_v4f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 
v[[LORESULT:[0-9]+]], [[LOADREG]]
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
-  %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  ret void
-}
-
-; This should not be adding instructions to shift into the correct
-; position in the word for the component.
-
-; SI-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
-; SI: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
-; SI: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
-; SI-NOT: v_lshlrev_b32
-; SI-NOT: v_or_b32
-
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG0]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG1]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, [[LOADREG2]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]], [[LOADREG3]]
-
-; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
-  %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  ret void
-}
-
-; XXX - This should really still be able to use the v_cvt_f32_ubyte0
-; for each component, but computeKnownBits doesn't handle vectors very
-; well.
-
-; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-; SI: v_cvt_f32_ubyte0_e32
-
-; XXX - replace with this when v4i8 loads aren't scalarized anymore.
-; XSI: buffer_load_dword
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; XSI: v_cvt_f32_u32_e32
-; SI: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
-  %cvt = uitofp <4 x i8> %load to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
-  store <4 x i8> %add, <4 x i8> addrspace(1)* %out2, align 4
-  ret void
-}
-
-; Make sure this doesn't crash.
-; SI-LABEL: {{^}}load_v7i8_to_v7f32: -; SI: s_endpgm -define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1 - %cvt = uitofp <7 x i8> %load to <7 x float> - store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}load_v8i8_to_v8f32: -; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}}, -; SI-NOT: bfe -; SI-NOT: lshr -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]] -; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] -; SI-NOT: bfe -; SI-NOT: lshr -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 - %cvt = uitofp <8 x i8> %load to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32: -; SI: buffer_load_dword [[LOADREG:v[0-9]+]], -; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]] -; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]] -; SI: buffer_store_dword [[CONV]], -define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 2 - %inreg = and i32 %add, 255 - %cvt = uitofp i32 %inreg to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32: -define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %inreg = and i32 %load, 65280 - %shr = lshr i32 %inreg, 8 - %cvt = uitofp i32 %shr to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; We don't get these ones because of the zext, but instcombine removes -; them so it shouldn't really matter. 
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { - %load = load i8, i8 addrspace(1)* %in, align 1 - %ext = zext i8 %load to i32 - %cvt = uitofp i32 %ext to float - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 - %ext = zext <4 x i8> %load to <4 x i32> - %cvt = uitofp <4 x i32> %ext to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/cvt_flr_i32_f32.ll b/test/CodeGen/R600/cvt_flr_i32_f32.ll deleted file mode 100644 index 2dd3a9f2a77..00000000000 --- a/test/CodeGen/R600/cvt_flr_i32_f32.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.floor.f32(float) #1 - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_0: -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NOT: add -; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %floor = call float @llvm.floor.f32(float %x) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_1: -; SI: v_add_f32_e64 [[TMP:v[0-9]+]], 1.0, s{{[0-9]+}} -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 1.0 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| -; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %floor = call float @llvm.floor.f32(float %x.fabs) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fneg: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fneg = fsub float -0.000000e+00, %x - %floor = call float @llvm.floor.f32(float %x.fneg) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_flr_i32_f32_fabs_fneg: -; SI-NOT: add -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| -; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs - %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* 
%out - ret void -} - -; FUNC-LABEL: {{^}}no_cvt_flr_i32_f32_0: -; SI-NOT: v_cvt_flr_i32_f32 -; SI: v_floor_f32 -; SI: v_cvt_u32_f32_e32 -; SI: s_endpgm -define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { - %floor = call float @llvm.floor.f32(float %x) #1 - %cvt = fptoui float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/cvt_rpi_i32_f32.ll b/test/CodeGen/R600/cvt_rpi_i32_f32.ll deleted file mode 100644 index 864ac40260b..00000000000 --- a/test/CodeGen/R600/cvt_rpi_i32_f32.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) #1 -declare float @llvm.floor.f32(float) #1 - -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: s_endpgm -define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %fadd = fadd float %x.fabs, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FIXME: This doesn't work because it forms fsub 0.5, x -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fneg: -; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} -; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, s{{[0-9]+}} -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fneg = fsub float -0.000000e+00, %x - %fadd = fadd float %x.fneg, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FIXME: This doesn't work for same reason as above -; FUNC-LABEL: {{^}}cvt_rpi_i32_f32_fabs_fneg: -; SI-SAFE-NOT: v_cvt_rpi_i32_f32 -; XSI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| - -; SI: v_sub_f32_e64 [[TMP:v[0-9]+]], 0.5, |s{{[0-9]+}}| -; SI-SAFE-NOT: v_cvt_flr_i32_f32 -; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] -; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { - %x.fabs = call float @llvm.fabs.f32(float %x) #1 - %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs - %fadd = fadd float %x.fabs.fneg, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptosi float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}no_cvt_rpi_i32_f32_0: -; SI-NOT: v_cvt_rpi_i32_f32 -; SI: v_add_f32 -; SI: v_floor_f32 -; SI: v_cvt_u32_f32 -; SI: s_endpgm -define void @no_cvt_rpi_i32_f32_0(i32 
addrspace(1)* %out, float %x) #0 { - %fadd = fadd float %x, 0.5 - %floor = call float @llvm.floor.f32(float %fadd) #1 - %cvt = fptoui float %floor to i32 - store i32 %cvt, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll deleted file mode 100644 index fb43ff4fbdd..00000000000 --- a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ /dev/null @@ -1,36 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test is for a bug in -; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where -; the wrong type was being passed to -; TargetLowering::getOperationAction() when checking the legality of -; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. - - -; CHECK: {{^}}sint: -; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %sint = load i32, i32 addrspace(1) * %in - %conv = sitofp i32 %sint to float - %0 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer - store <4 x float> %splat, <4 x float> addrspace(1)* %out - ret void -} - -;CHECK: {{^}}uint: -;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %uint = load i32, i32 addrspace(1) * %in - %conv = uitofp i32 %uint to float - %0 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer - store <4 x float> %splat, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/debug.ll b/test/CodeGen/R600/debug.ll deleted file mode 100644 index a2e0e878b74..00000000000 --- a/test/CodeGen/R600/debug.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=dumpcode -filetype=obj | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; Test for a crash in the custom assembly dump code. 
- -; SI: s_endpgm -define void @test(i32 addrspace(1)* %out) { - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll deleted file mode 100644 index da8e91454b9..00000000000 --- a/test/CodeGen/R600/default-fp-mode.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_kernel: - -; DEFAULT: FloatMode: 192 -; DEFAULT: IeeeMode: 0 - -; FP64-DENORMAL: FloatMode: 192 -; FP64-DENORMAL: IeeeMode: 0 - -; FP32-DENORMAL: FloatMode: 48 -; FP32-DENORMAL: IeeeMode: 0 - -; BOTH-DENORMAL: FloatMode: 240 -; BOTH-DENORMAL: IeeeMode: 0 - -; NO-DENORMAL: FloatMode: 0 -; NO-DENORMAL: IeeeMode: 0 -define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { - store float 0.0, float addrspace(1)* %out0 - store double 0.0, double addrspace(1)* %out1 - ret void -} diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll deleted file mode 100644 index cdd2c0cd4f4..00000000000 --- a/test/CodeGen/R600/disconnected-predset-break-bug.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; PRED_SET* instructions must be tied to any instruction that uses their -; result. This tests that there are no instructions between the PRED_SET* -; and the PREDICATE_BREAK in this loop. 
- -; CHECK: {{^}}loop_ge: -; CHECK: LOOP_START_DX10 -; CHECK: ALU_PUSH_BEFORE -; CHECK-NEXT: JUMP -; CHECK-NEXT: LOOP_BREAK -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { -entry: - %cmp5 = icmp sgt i32 %iterations, 0 - br i1 %cmp5, label %for.body, label %for.end - -for.body: ; preds = %for.body, %entry - %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] - %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %i.07 = add nsw i32 %i.07.in, -1 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 - store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 - %add = add nsw i32 %ai.06, 1 - %exitcond = icmp eq i32 %add, %iterations - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll deleted file mode 100644 index 4df7b63bf98..00000000000 --- a/test/CodeGen/R600/dot4-folding.ll +++ /dev/null @@ -1,27 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Exactly one constant vector can be folded into dot4, which means exactly -; 4 MOV instructions -; CHECK: {{^}}main: -; CHECK: MOV -; CHECK: MOV -; CHECK: MOV -; CHECK: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV -; CHECK-NOT: MOV - -define void @main(float addrspace(1)* %out) { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = call float @llvm.AMDGPU.dp4(<4 x float> %0,<4 x float> %1) - %3 = insertelement <4 x float> undef, float %2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %3, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll deleted file mode 100644 index e7e13d6178c..00000000000 --- a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() #1 - -; Function Attrs: nounwind -; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop: -; CHECK: BB0_1: -; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]], -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] -; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]] -; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]] -; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]] -; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]] -; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]] - -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:1 -; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33 -; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 -; CHECK: s_endpgm -define void 
@signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { -entry: - %x.i = tail call i32 @llvm.r600.read.tidig.x() #0 - %mul = shl nsw i32 %x.i, 1 - br label %for.body - -for.body: ; preds = %for.body, %entry - %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ] - %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ] - %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - tail call void @llvm.AMDGPU.barrier.local() #1 - %arrayidx = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %offset.02 - %tmp = load float, float addrspace(3)* %arrayidx, align 4 - %add1 = add nsw i32 %offset.02, 1 - %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add1 - %tmp1 = load float, float addrspace(3)* %arrayidx2, align 4 - %add3 = add nsw i32 %offset.02, 32 - %arrayidx4 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add3 - %tmp2 = load float, float addrspace(3)* %arrayidx4, align 4 - %add5 = add nsw i32 %offset.02, 33 - %arrayidx6 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add5 - %tmp3 = load float, float addrspace(3)* %arrayidx6, align 4 - %add7 = add nsw i32 %offset.02, 64 - %arrayidx8 = getelementptr inbounds float, float addrspace(3)* %lptr, i32 %add7 - %tmp4 = load float, float addrspace(3)* %arrayidx8, align 4 - %add9 = fadd float %tmp, %tmp1 - %add10 = fadd float %add9, %tmp2 - %add11 = fadd float %add10, %tmp3 - %add12 = fadd float %add11, %tmp4 - %add13 = fadd float %sum.03, %add12 - %inc = add nsw i32 %k.01, 1 - %add14 = add nsw i32 %offset.02, 97 - %exitcond = icmp eq i32 %inc, 8 - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body - %tmp5 = sext i32 %x.i to i64 - %arrayidx15 = getelementptr inbounds float, float addrspace(1)* %out, i64 %tmp5 - store float %add13, float addrspace(1)* %arrayidx15, align 4 - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { noduplicate nounwind } -attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll deleted file mode 100644 index 5929898f8bd..00000000000 --- a/test/CodeGen/R600/ds_read2.ll +++ /dev/null @@ -1,515 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -; FIXME: We don't get cases where the address was an SGPR because we -; get a copy to the address register for each one. 
- -@lds = addrspace(3) global [512 x float] undef, align 4 - @lds.f64 = addrspace(3) global [512 x double] undef, align 8 - -; SI-LABEL: @simple_read2_f32 -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f32(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_max_offset -; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_too_far -; SI-NOT ds_read2_b32 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm -define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f32_x2 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load 
float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure there is an instruction between the two sets of reads. -; SI-LABEL: @simple_read2_f32_x2_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 -; SI: s_barrier -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - call void @llvm.AMDGPU.barrier.local() #2 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; For some reason adding something to the base address for the first -; element results in only folding the inner pair. 
- -; SI-LABEL: @simple_read2_f32_x2_nonzero_base -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum.0 = fadd float %val0, %val1 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - %val2 = load float, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - %val3 = load float, float addrspace(3)* %arrayidx3, align 4 - %sum.1 = fadd float %val2, %val3 - - %sum = fadd float %sum.0, %sum.1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Be careful of vectors of pointers. We don't know if the 2 pointers -; in the vectors are really the same base, so this is not safe to -; merge. -; Base pointers come from different subregister of same super -; register. We can't safely merge this. - -; SI-LABEL: @read2_ptr_is_subreg_arg_f32 -; SI-NOT: ds_read2_b32 -; SI: ds_read_b32 -; SI: ds_read_b32 -; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - %val0 = load float, float addrspace(3)* %gep.0, align 4 - %val1 = load float, float addrspace(3)* %gep.1, align 4 - %add.x = add nsw i32 %x.i, 8 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Apply a constant scalar offset after the pointer vector extract. We -; are rejecting merges that have the same, constant 0 offset, so make -; sure we are really rejecting it because of the different -; subregisters. 
-
-; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
-; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
-  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
-  %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-
-  ; Apply an additional offset after the vector that will be more obviously folded.
-  %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
-
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
-  %add.x = add nsw i32 %x.i, 8
-  %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
-  ret void
-}
-
-; We should be able to merge in this case, but probably not worth the effort.
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
-; SI: s_endpgm
-define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
-  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
-  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
-  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
-  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
-  %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
-  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
-  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
-  %val0 = load float, float addrspace(3)* %gep.0, align 4
-  %val1 = load float, float addrspace(3)* %gep.1, align 4
-  %add.x = add nsw i32 %x.i, 8
-  %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
-  ret void
-}
-
-; SI-LABEL: @simple_read2_f32_volatile_0
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
-; SI: s_endpgm
-define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
-  %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
-  %add.x = add nsw i32 %x.i, 8
-  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
-  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
-  %sum = fadd float %val0, %val1
-  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
-  store float %sum, float addrspace(1)* %out.gep, align 4
-  ret void
-}
-
-; SI-LABEL: @simple_read2_f32_volatile_1
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
-; SI: s_endpgm
-define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
-  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
-  %arrayidx0 =
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Can't fold since not correctly aligned. -; XXX: This isn't really testing anything useful now. I think CI -; allows unaligned LDS accesses, which would be a problem here. -; SI-LABEL: @unaligned_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm -define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 1 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 1 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @misaligned_2_simple_read2_f32 -; SI-NOT: ds_read2_b32 -; SI: s_endpgm -define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 2 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 2 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2_f64 -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} -; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2_f64(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2_f64_max_offset -; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 -; SI: s_endpgm -define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 
0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2_f64_too_far -; SI-NOT ds_read2_b64 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 -; SI: s_endpgm -define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; Alignment only 4 -; SI-LABEL: @misaligned_read2_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 -; SI: s_endpgm -define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 7 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 4 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -@foo = addrspace(3) global [4 x i32] undef, align 4 - -; SI-LABEL: @load_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { - %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 - %sum = add i32 %val0, %val1 - store i32 %sum, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @load_constant_disjoint_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 -define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { - %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 - %sum = add i32 %val0, %val1 - store i32 %sum, i32 addrspace(1)* %out, align 4 - ret void -} - -@bar = addrspace(3) global [4 x i64] undef, align 4 - -; SI-LABEL: @load_misaligned64_constant_offsets -; 
SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 -define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { - %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 - %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 - %sum = add i64 %val0, %val1 - store i64 %sum, i64 addrspace(1)* %out, align 8 - ret void -} - -@bar.large = addrspace(3) global [4096 x i64] undef, align 4 - -; SI-LABEL: @load_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 -; SI: s_endpgm -define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { - %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 - %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 - %sum = add i64 %val0, %val1 - store i64 %sum, i64 addrspace(1)* %out, align 8 - ret void -} - -@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 -@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 - -define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { - %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i - %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4 - %add47 = add nsw i32 %x.i, 1 - %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 - %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4 - %add51 = add nsw i32 %x.i, 16 - %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 - %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4 - %add55 = add nsw i32 %x.i, 17 - %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 - %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4 - %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i - %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4 - %add63 = add nsw i32 %y.i, 1 - %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 - %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4 - %add67 = add nsw i32 %y.i, 32 - %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 - %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4 - %add71 = add nsw i32 %y.i, 33 - %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 - %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4 - %add75 = add nsw i32 %y.i, 64 - %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 - %tmp24 = load 
float, float addrspace(3)* %arrayidx76, align 4 - %add79 = add nsw i32 %y.i, 65 - %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 - %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4 - %sum.0 = fadd float %tmp16, %tmp17 - %sum.1 = fadd float %sum.0, %tmp18 - %sum.2 = fadd float %sum.1, %tmp19 - %sum.3 = fadd float %sum.2, %tmp20 - %sum.4 = fadd float %sum.3, %tmp21 - %sum.5 = fadd float %sum.4, %tmp22 - %sum.6 = fadd float %sum.5, %tmp23 - %sum.7 = fadd float %sum.6, %tmp24 - %sum.8 = fadd float %sum.7, %tmp25 - store float %sum.8, float addrspace(1)* %C, align 4 - ret void -} - -define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { - %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 - store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { - %load = load i64, i64 addrspace(3)* %in, align 4 - store i64 %load, i64 addrspace(1)* %out, align 8 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/ds_read2_offset_order.ll b/test/CodeGen/R600/ds_read2_offset_order.ll deleted file mode 100644 index 9ea9a5a2617..00000000000 --- a/test/CodeGen/R600/ds_read2_offset_order.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -; XFAIL: * - -@lds = addrspace(3) global [512 x float] undef, align 4 - -; SI-LABEL: {{^}}offset_order: - -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 -; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 - -define void @offset_order(float addrspace(1)* %out) { -entry: - %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0 - %val0 = load float, float addrspace(3)* %ptr0 - - %ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256 - %val1 = load float, float addrspace(3)* %ptr1 - %add1 = fadd float %val0, %val1 - - %ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3 - %val2 = load float, float addrspace(3)* %ptr2 - %add2 = fadd float %add1, %val2 - - %ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 - %val3 = load float, float addrspace(3)* %ptr3 - %add3 = fadd float %add2, %val3 - - %ptr4 = 
getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12 - %val4 = load float, float addrspace(3)* %ptr4 - %add4 = fadd float %add3, %val4 - - %ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14 - %val5 = load float, float addrspace(3)* %ptr5 - %add5 = fadd float %add4, %val5 - - %ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 - %val6 = load float, float addrspace(3)* %ptr6 - %add6 = fadd float %add5, %val6 - store float %add6, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll deleted file mode 100644 index 54b3b45636d..00000000000 --- a/test/CodeGen/R600/ds_read2st64.ll +++ /dev/null @@ -1,272 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - -@lds = addrspace(3) global [512 x float] undef, align 4 -@lds.f64 = addrspace(3) global [512 x double] undef, align 8 - - -; SI-LABEL: @simple_read2st64_f32_0_1 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_1_2 -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 128 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_max_offset -; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float 
addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 16320 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f32_over_max_offset -; SI-NOT: ds_read2st64_b32 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 -; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]] -; SI: s_endpgm -define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 16384 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @odd_invalid_read2st64_f32_0 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm -define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 63 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @odd_invalid_read2st64_f32_1 -; SI-NOT: ds_read2st64_b32 -; SI: s_endpgm -define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 127 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i - store float %sum, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_0_1 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i 
- %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @simple_read2st64_f64_1_2 -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; Alignment only - -; SI-LABEL: @misaligned_read2st64_f64 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 -; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 -; SI: s_endpgm -define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 4 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff -; SI-LABEL: @simple_read2st64_f64_max_offset -; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127 -; SI: s_waitcnt lgkmcnt(0) -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 256 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: 
@simple_read2st64_f64_over_max_offset -; SI-NOT: ds_read2st64_b64 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}} -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 -; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] -; SI: s_endpgm -define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8192 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; SI-LABEL: @invalid_read2st64_f64_odd_offset -; SI-NOT: ds_read2st64_b64 -; SI: s_endpgm -define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.x.0 = add nsw i32 %x.i, 64 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8129 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 8 - ret void -} - -; The stride of 8 elements is 8 * 8 bytes. We need to make sure the -; stride in elements, not bytes, is a multiple of 64. 
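The offset arithmetic behind these st64 cases is easier to see spelled out. The helper below is an editorial aside, not code from this patch: it simply assumes, as the comments above state, that each ds_read2st64 offset field is an 8-bit count of 64-element strides and that the resulting byte offset must stay within 0xffff.

// Editorial illustration of the st64 offset limits discussed above;
// not taken from the LLVM sources.
#include <cstdio>

// Byte offset addressed by an st64 offset field for a given element size.
constexpr unsigned st64Bytes(unsigned offsetField, unsigned eltBytes) {
  return offsetField * 64 * eltBytes;
}

int main() {
  // f32: the full 8-bit field fits, so offset1:255 (add of 16320 = 255*64) is legal.
  std::printf("f32 offset1:255 -> %u bytes\n", st64Bytes(255, 4)); // 65280 <= 0xffff

  // f64: 255 would overflow 16 bits, which is why the max-offset test stops at 127.
  std::printf("f64 offset1:255 -> %u bytes\n", st64Bytes(255, 8)); // 130560 > 0xffff
  std::printf("f64 offset1:127 -> %u bytes\n", st64Bytes(127, 8)); // 65024 <= 0xffff

  // An 8-element f64 stride is 64 bytes, not 64 elements, so the following test
  // expects plain ds_read2_b64 ... offset1:8 rather than ds_read2st64_b64.
  std::printf("8 x f64 stride -> %u bytes\n", 8 * 8);
  return 0;
}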
- -; SI-LABEL: @byte_size_only_divisible_64_read2_f64 -; SI-NOT: ds_read2st_b64 -; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 -; SI: s_endpgm -define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - %val0 = load double, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - %val1 = load double, double addrspace(3)* %arrayidx1, align 8 - %sum = fadd double %val0, %val1 - %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i - store double %sum, double addrspace(1)* %out.gep, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll deleted file mode 100644 index b553d3459e4..00000000000 --- a/test/CodeGen/R600/ds_write2.ll +++ /dev/null @@ -1,425 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s - -@lds = addrspace(3) global [512 x float] undef, align 4 -@lds.f64 = addrspace(3) global [512 x double] undef, align 8 - - -; SI-LABEL: @simple_write2_one_val_f32 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i - %val = load float, float addrspace(1)* %in.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 
= getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_volatile_0 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store volatile float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_volatile_1 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} -; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 -; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store volatile float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; 2 data subregisters from different super registers. 
-; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32 -; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 - %val0 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8 - %val1 = load <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8 - %val0.0 = extractelement <2 x float> %val0, i32 0 - %val1.1 = extractelement <2 x float> %val1, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0.0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1.1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_subreg2_f32 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i - %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 - %val0 = extractelement <2 x float> %val, i32 0 - %val1 = extractelement <2 x float> %val, i32 1 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_subreg4_f32 -; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i - %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 - %val0 = extractelement <4 x float> %val, i32 0 - %val1 = extractelement <4 x float> %val, i32 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: 
buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm -define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 255 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_too_far_f32 -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} -; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 -; SI: s_endpgm -define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 257 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f32_x2 -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 0 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: 
@simple_write2_two_val_f32_x2_nonzero_base -; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 -; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 -; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %idx.0 = add nsw i32 %tid.x, 3 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - - %idx.1 = add nsw i32 %tid.x, 8 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - - %idx.2 = add nsw i32 %tid.x, 11 - %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2 - store float %val0, float addrspace(3)* %arrayidx2, align 4 - - %idx.3 = add nsw i32 %tid.x, 27 - %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3 - store float %val1, float addrspace(3)* %arrayidx3, align 4 - - ret void -} - -; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32 -; SI-NOT: ds_write2_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: s_endpgm -define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i - %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i - %val0 = load float, float addrspace(1)* %in0.gep, align 4 - %val1 = load float, float addrspace(1)* %in1.gep, align 4 - - %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 - %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 - %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1 - %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0 - %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1 - - ; Apply an additional offset after the vector that will be more obviously folded. 
- %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8 - store float %val0, float addrspace(3)* %gep.0, align 4 - - %add.x = add nsw i32 %x.i, 8 - store float %val1, float addrspace(3)* %gep.1.offset, align 4 - ret void -} - -; SI-LABEL: @simple_write2_one_val_f64 -; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 -; SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; SI-LABEL: @misaligned_simple_write2_one_val_f64 -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 -; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 -; SI: s_endpgm -define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 7 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2_two_val_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} -; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 -; SI: s_endpgm -define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 - %val0 = load double, double addrspace(1)* %in.gep.0, align 8 - %val1 = load double, double addrspace(1)* %in.gep.1, align 8 - %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i - store double %val0, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x - store double %val1, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -@foo = addrspace(3) global [4 x i32] undef, align 4 - -; SI-LABEL: @store_constant_adjacent_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], 
v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -define void @store_constant_adjacent_offsets() { - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 - ret void -} - -; SI-LABEL: @store_constant_disjoint_offsets -; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 -define void @store_constant_disjoint_offsets() { - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 - store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 - ret void -} - -@bar = addrspace(3) global [4 x i64] undef, align 4 - -; SI-LABEL: @store_misaligned64_constant_offsets -; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 -define void @store_misaligned64_constant_offsets() { - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 - ret void -} - -@bar.large = addrspace(3) global [4096 x i64] undef, align 4 - -; SI-LABEL: @store_misaligned64_constant_large_offsets -; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm -define void @store_misaligned64_constant_large_offsets() { - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 - store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 - ret void -} - -@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 -@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 - -define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %y.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %val = load float, float addrspace(1)* %in - %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx44, align 4 - %add47 = add nsw i32 %x.i, 1 - %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47 - store float %val, float addrspace(3)* %arrayidx48, align 4 - %add51 = add nsw i32 %x.i, 16 - %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51 - store float %val, float addrspace(3)* %arrayidx52, align 4 - %add55 = add nsw i32 %x.i, 17 - %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55 - store float %val, float addrspace(3)* %arrayidx56, align 4 - %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i - store float %val, 
float addrspace(3)* %arrayidx60, align 4 - %add63 = add nsw i32 %y.i, 1 - %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63 - store float %val, float addrspace(3)* %arrayidx64, align 4 - %add67 = add nsw i32 %y.i, 32 - %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67 - store float %val, float addrspace(3)* %arrayidx68, align 4 - %add71 = add nsw i32 %y.i, 33 - %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71 - store float %val, float addrspace(3)* %arrayidx72, align 4 - %add75 = add nsw i32 %y.i, 64 - %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75 - store float %val, float addrspace(3)* %arrayidx76, align 4 - %add79 = add nsw i32 %y.i, 65 - %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79 - store float %val, float addrspace(3)* %arrayidx80, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll deleted file mode 100644 index 1d9d881c5c7..00000000000 --- a/test/CodeGen/R600/ds_write2st64.ll +++ /dev/null @@ -1,119 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - - -@lds = addrspace(3) global [512 x float] undef, align 4 - - -; SI-LABEL: @simple_write2st64_one_val_f32_0_1 -; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 -; SI: s_endpgm -define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i - %val = load float, float addrspace(1)* %in.gep, align 4 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i - store float %val, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 64 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x - store float %val, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_f32_2_5 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 -; SI: s_endpgm -define void 
@simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %add.x.0 = add nsw i32 %x.i, 128 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x.1 = add nsw i32 %x.i, 320 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1 - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_max_offset_f32 -; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 -; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %val0 = load float, float addrspace(1)* %in.gep.0, align 4 - %val1 = load float, float addrspace(1)* %in.gep.1, align 4 - %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i - store float %val0, float addrspace(3)* %arrayidx0, align 4 - %add.x = add nsw i32 %x.i, 16320 - %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x - store float %val1, float addrspace(3)* %arrayidx1, align 4 - ret void -} - -; SI-LABEL: @simple_write2st64_two_val_max_offset_f64 -; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], -; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 -; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i - %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 - %val0 = load double, double addrspace(1)* %in.gep.0, align 8 - %val1 = load double, double addrspace(1)* %in.gep.1, align 8 - %add.x.0 = add nsw i32 %x.i, 256 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 - store double %val0, double addrspace(3)* %arrayidx0, align 8 - %add.x.1 = add nsw i32 %x.i, 8128 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1 - store double %val1, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64 -; SI-NOT: ds_write2st64_b64 -; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 -; SI: s_endpgm -define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { - %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i - %val = load double, double addrspace(1)* %in.gep, align 8 - %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i - store double %val, double addrspace(3)* %arrayidx0, align 8 - %add.x = add nsw i32 %x.i, 8 - %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x - store double %val, double addrspace(3)* %arrayidx1, align 8 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll deleted file mode 100644 index d0fd06a3437..00000000000 --- a/test/CodeGen/R600/elf.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s -; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s - -; Test that we don't try to produce a COFF file on windows -; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols - | FileCheck --check-prefix=ELF %s - -; ELF: Format: ELF32 -; ELF: Name: .AMDGPU.config -; ELF: Type: SHT_PROGBITS - -; ELF: Symbol { -; ELF: Name: test -; ELF: Binding: Global - -; CONFIG: .section .AMDGPU.config -; CONFIG-NEXT: .long 45096 -; TYPICAL-NEXT: .long 0 -; TONGA-NEXT: .long 576 -; CONFIG: .align 256 -; CONFIG: test: -define void @test(i32 %p) #0 { - %i = add i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll deleted file mode 100644 index 51cd0850093..00000000000 --- a/test/CodeGen/R600/elf.r600.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF %s -; RUN: llc < %s 
-march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=CONFIG %s - -; ELF: Format: ELF32 -; ELF: Name: .AMDGPU.config - -; CONFIG: .section .AMDGPU.config -; CONFIG-NEXT: .long 166100 -; CONFIG-NEXT: .long 2 -; CONFIG-NEXT: .long 165900 -; CONFIG-NEXT: .long 0 -define void @test(float addrspace(1)* %out, i32 %p) { - %i = add i32 %p, 2 - %r = bitcast i32 %i to float - store float %r, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/empty-function.ll b/test/CodeGen/R600/empty-function.ll deleted file mode 100644 index a060900811e..00000000000 --- a/test/CodeGen/R600/empty-function.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; Make sure we don't assert on empty functions - -; SI: .text -; SI-LABEL: {{^}}empty_function_ret: -; SI: s_endpgm -; SI: codeLenInByte = 4 -define void @empty_function_ret() #0 { - ret void -} - -; SI: .text -; SI-LABEL: {{^}}empty_function_unreachable: -; SI: codeLenInByte = 0 -define void @empty_function_unreachable() #0 { - unreachable -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/endcf-loop-header.ll b/test/CodeGen/R600/endcf-loop-header.ll deleted file mode 100644 index 267a323c506..00000000000 --- a/test/CodeGen/R600/endcf-loop-header.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; This tests that the llvm.SI.end.cf intrinsic is not inserted into the -; loop block. This intrinsic will be lowered to s_or_b64 by the code -; generator. - -; CHECK-LABEL: {{^}}test: - -; This is was lowered from the llvm.SI.end.cf intrinsic: -; CHECK: s_or_b64 exec, exec - -; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} -; CHECK-NOT: s_or_b64 exec, exec -; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out, i32 %cond) { -entry: - %tmp0 = icmp eq i32 %cond, 0 - br i1 %tmp0, label %if, label %loop - -if: - store i32 0, i32 addrspace(1)* %out - br label %loop - -loop: - %tmp1 = phi i32 [0, %entry], [0, %if], [%inc, %loop] - %inc = add i32 %tmp1, %cond - %tmp2 = icmp ugt i32 %inc, 10 - br i1 %tmp2, label %done, label %loop - -done: - %tmp3 = getelementptr i32, i32 addrspace(1)* %out, i64 1 - store i32 %inc, i32 addrspace(1)* %tmp3 - ret void -} diff --git a/test/CodeGen/R600/extload-private.ll b/test/CodeGen/R600/extload-private.ll deleted file mode 100644 index 294c3a9c678..00000000000 --- a/test/CodeGen/R600/extload-private.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i8_sext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 - %tmp2 = sext i8 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i8_zext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i8 - %tmp1 = load i8, i8* %tmp0 - %tmp2 = zext i8 %tmp1 to i32 - store i32 %tmp2, 
i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i16_sext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 - %tmp2 = sext i16 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -define void @load_i16_zext_private(i32 addrspace(1)* %out) { -entry: - %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 - %tmp2 = zext i16 %tmp1 to i32 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll deleted file mode 100644 index 662eb7a9716..00000000000 --- a/test/CodeGen/R600/extload.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}anyext_load_i8: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], -; EG: VTX_READ_32 [[VAL]] - -define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { - %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* - %load = load i32, i32 addrspace(1)* %cast, align 1 - %x = bitcast i32 %load to <4 x i8> - %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)* - store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_i16: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], -; EG: VTX_READ_32 [[VAL]] - -define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { - %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* - %load = load i32, i32 addrspace(1)* %cast, align 1 - %x = bitcast i32 %load to <2 x i16> - %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)* - store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_lds_i8: -; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] -; EG: LDS_WRITE * [[VAL]] -define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { - %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* - %load = load i32, i32 addrspace(3)* %cast, align 1 - %x = bitcast i32 %load to <4 x i8> - %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)* - store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1 - ret void -} - -; FUNC-LABEL: {{^}}anyext_load_lds_i16: -; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] -; EG: LDS_WRITE * [[VAL]] -define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { - %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* - %load = load i32, i32 addrspace(3)* %cast, align 1 - %x = bitcast i32 %load to <2 x i16> - %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)* - store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1 - ret void -} diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll deleted file 
mode 100644 index c7572efc6f5..00000000000 --- a/test/CodeGen/R600/extract_vector_elt_i16.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}extract_vector_elt_v2i16: -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_store_short -; SI: buffer_store_short -define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind { - %p0 = extractelement <2 x i16> %foo, i32 0 - %p1 = extractelement <2 x i16> %foo, i32 1 - %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 - ret void -} - -; FUNC-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_store_short -; SI: buffer_store_short -define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind { - %p0 = extractelement <4 x i16> %foo, i32 0 - %p1 = extractelement <4 x i16> %foo, i32 2 - %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 - ret void -} diff --git a/test/CodeGen/R600/fabs.f64.ll b/test/CodeGen/R600/fabs.f64.ll deleted file mode 100644 index 3c6136c1a7b..00000000000 --- a/test/CodeGen/R600/fabs.f64.ll +++ /dev/null @@ -1,97 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -declare double @fabs(double) readnone -declare double @llvm.fabs.f64(double) readnone -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone -declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone - -; FUNC-LABEL: {{^}}v_fabs_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tidext = sext i32 %tid to i64 - %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext - %val = load double, double addrspace(1)* %gep, align 8 - %fabs = call double @llvm.fabs.f64(double %val) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_f64: -; SI: v_and_b32 -; SI-NOT: v_and_b32 -; SI: s_endpgm -define void @fabs_f64(double addrspace(1)* %out, double %in) { - %fabs = call double @llvm.fabs.f64(double %in) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { - %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) - store <2 x double> %fabs, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { - %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) - store <4 x double> %fabs, <4 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm -define 
void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { - %fabs = call double @llvm.fabs.f64(double %in0) - %fmul = fmul double %fabs, %in1 - store double %fmul, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm -define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { - %fabs = call double @fabs(double %in0) - %fmul = fmul double %fabs, %in1 - store double %fmul, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { - %bc= bitcast i64 %in to double - %fabs = call double @llvm.fabs.f64(double %bc) - store double %fabs, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: v_and_b32 -; SI: s_endpgm -define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { - %bc= bitcast i64 %in to double - %fabs = call double @fabs(double %bc) - store double %fabs, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll deleted file mode 100644 index 419a73d0266..00000000000 --- a/test/CodeGen/R600/fabs.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; DAGCombiner will transform: -; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fabs_fn_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: v_and_b32 - -define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { - %bc= bitcast i32 %in to float - %fabs = call float @fabs(float %bc) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_free: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| - -; GCN: v_and_b32 - -define void @fabs_free(float addrspace(1)* %out, i32 %in) { - %bc= bitcast i32 %in to float - %fabs = call float @llvm.fabs.f32(float %bc) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -define void @fabs_f32(float addrspace(1)* %out, float %in) { - %fabs = call float @llvm.fabs.f32(float %in) - store float %fabs, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -; GCN: v_and_b32 -define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) - store <2 x float> %fabs, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fabs_v4f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: |{{(PV|T[0-9])\.[XYZW]}}| - -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) - store <4 x float> %fabs, <4 x float> 
addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} -define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { - %fabs = call float @fabs(float %in0) - %fmul = fmul float %fabs, %in1 - store float %fmul, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fabs_fold: -; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} -define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { - %fabs = call float @llvm.fabs.f32(float %in0) - %fmul = fmul float %fabs, %in1 - store float %fmul, float addrspace(1)* %out - ret void -} - -declare float @fabs(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll deleted file mode 100644 index 5fac328c598..00000000000 --- a/test/CodeGen/R600/fadd.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -; FUNC-LABEL: {{^}}fadd_f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI: v_add_f32 -define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) { - %add = fadd float %a, %b - store float %add, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v2f32: -; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z -; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { - %add = fadd <2 x float> %a, %b - store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v4f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 - %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}fadd_v8f32: -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; R600: ADD -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) { - %add = fadd <8 x float> %a, %b - store <8 x float> %add, <8 x float> 
addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll deleted file mode 100644 index 485c55870c4..00000000000 --- a/test/CodeGen/R600/fadd64.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fadd_f64: -; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} - -define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fadd double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll deleted file mode 100644 index f23e8919d73..00000000000 --- a/test/CodeGen/R600/fceil.ll +++ /dev/null @@ -1,132 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.ceil.f32(float) nounwind readnone -declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone -declare <3 x float> @llvm.ceil.v3f32(<3 x float>) nounwind readnone -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone -declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone -declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone - -; FUNC-LABEL: {{^}}fceil_f32: -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_f32(float addrspace(1)* %out, float %x) { - %y = call float @llvm.ceil.f32(float %x) nounwind readnone - store float %y, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v2f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { - %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone - store <2 x float> %y, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v3f32: -; FIXME-SI: v_ceil_f32_e32 -; FIXME-SI: v_ceil_f32_e32 -; FIXME-SI: v_ceil_f32_e32 -; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { - %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone - store <3 x float> %y, <3 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v4f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? *}}[[RESULT]] -; EG: CEIL {{\*? 
*}}[[RESULT]] -define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { - %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone - store <4 x float> %y, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v8f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { - %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone - store <8 x float> %y, <8 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v16f32: -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; SI: v_ceil_f32_e32 -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}} -; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT4:T[0-9]+]]{{\.[XYZW]}} -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT1]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT3]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? *}}[[RESULT4]] -; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] -define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { - %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone - store <16 x float> %y, <16 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll deleted file mode 100644 index e8c34f0141e..00000000000 --- a/test/CodeGen/R600/fceil64.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.ceil.f64(double) nounwind readnone -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.ceil.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.ceil.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}fceil_f64: -; CI: v_ceil_f64_e32 -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI-DAG: v_cmp_lt_f64 -; SI-DAG: v_cmp_lg_f64 -; SI: s_and_b64 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: v_add_f64 -; SI: s_endpgm -define void @fceil_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.ceil.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v2f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}fceil_v3f64: -; FIXME-CI: v_ceil_f64_e32 -; FIXME-CI: v_ceil_f64_e32 -; FIXME-CI: v_ceil_f64_e32 -; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}fceil_v4f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v8f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fceil_v16f64: -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: 
v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -; CI: v_ceil_f64_e32 -define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll deleted file mode 100644 index 530274f920f..00000000000 --- a/test/CodeGen/R600/fcmp-cnd.ll +++ /dev/null @@ -1,14 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;Not checking arguments 2 and 3 to CNDE, because they may change between -;registers and literal.x depending on what the optimizer does. -;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %cmp = fcmp oeq float %0, 0.000000e+00 - %value = select i1 %cmp, i32 2, i32 3 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll deleted file mode 100644 index c402805feb3..00000000000 --- a/test/CodeGen/R600/fcmp-cnde-int-args.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the -; chance to optimize the fcmp + select instructions to SET* was missed -; due to the fact that the operands to fcmp and select had different types - -; CHECK: SET{{[A-Z]+}}_DX10 - -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %cmp = fcmp oeq float %0, 0.000000e+00 - %value = select i1 %cmp, i32 -1, i32 0 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll deleted file mode 100644 index 5207ab57bad..00000000000 --- a/test/CodeGen/R600/fcmp.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}fcmp_sext: -; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 - %1 = load float, float addrspace(1)* %arrayidx1 - %cmp = fcmp oeq float %0, %1 - %sext = sext i1 %cmp to i32 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -; This test checks that a setcc node with f32 operands is lowered to a -; SET*_DX10 instruction. 
Previously we were lowering this to: -; SET* + FP_TO_SINT - -; CHECK: {{^}}fcmp_br: -; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} -; CHECK-NEXT {{[0-9]+(5.0}} - -define void @fcmp_br(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - br i1 %0, label %IF, label %ENDIF - -IF: - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %1 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll deleted file mode 100644 index 053ab0ed7aa..00000000000 --- a/test/CodeGen/R600/fcmp64.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}flt_f64: -; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ult double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fle_f64: -; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ule double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fgt_f64: -; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ugt double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fge_f64: -; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp uge double %r0, %r1 - %r3 = zext i1 %r2 to i32 - store i32 %r3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fne_f64: -; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp une double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}feq_f64: -; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fcmp ueq double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll deleted file mode 100644 index 89af37545c9..00000000000 --- a/test/CodeGen/R600/fconst64.ll +++ 
/dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}fconst_f64: -; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 -; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 - -define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r1 = load double, double addrspace(1)* %in - %r2 = fadd double %r1, 5.000000e+00 - store double %r2, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll deleted file mode 100644 index b719d5a3978..00000000000 --- a/test/CodeGen/R600/fcopysign.f32.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare float @llvm.copysign.f32(float, float) nounwind readnone -declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone - -; Try to identify arg based on higher address. -; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb -; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc -; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c -; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30 -; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]] -; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]] -; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm - -; EG: BFI_INT -define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { - %result = call float @llvm.copysign.f32(float %mag, float %sign) - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v2f32: -; GCN: s_endpgm - -; EG: BFI_INT -; EG: BFI_INT -define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { - %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) - store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v4f32: -; GCN: s_endpgm - -; EG: BFI_INT -; EG: BFI_INT -; EG: BFI_INT -; EG: BFI_INT -define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { - %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll deleted file mode 100644 index 3d8c5599308..00000000000 --- a/test/CodeGen/R600/fcopysign.f64.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -declare double @llvm.copysign.f64(double, double) nounwind readnone -declare <2 x double> 
@llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone -declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}test_copysign_f64: -; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]] -; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]] -; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff -; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]] -; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -; GCN: s_endpgm -define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { - %result = call double @llvm.copysign.f64(double %mag, double %sign) - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v2f64: -; GCN: s_endpgm -define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { - %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) - store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}test_copysign_v4f64: -; GCN: s_endpgm -define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { - %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) - store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/fdiv.f64.ll b/test/CodeGen/R600/fdiv.f64.ll deleted file mode 100644 index 7c022e38c80..00000000000 --- a/test/CodeGen/R600/fdiv.f64.ll +++ /dev/null @@ -1,96 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s - - -; COMMON-LABEL: {{^}}fdiv_f64: -; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 -; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] - -; Check for div_scale bug workaround on SI -; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] - -; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] - -; SI-DAG: v_cmp_eq_i32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}} -; SI-DAG: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}} -; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc - -; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0 
-; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]] -; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0 -; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]] -; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]] -; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]] -; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]] -; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] -; COMMON: buffer_store_dwordx2 [[RESULT]] -; COMMON: s_endpgm -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind { - %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 - %num = load double, double addrspace(1)* %in - %den = load double, double addrspace(1)* %gep.1 - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_s_v: -define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind { - %den = load double, double addrspace(1)* %in - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_v_s: -define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind { - %num = load double, double addrspace(1)* %in - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}fdiv_f64_s_s: -define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind { - %result = fdiv double %num, %den - store double %result, double addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}v_fdiv_v2f64: -define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind { - %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 - %num = load <2 x double>, <2 x double> addrspace(1)* %in - %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 - %result = fdiv <2 x double> %num, %den - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}s_fdiv_v2f64: -define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { - %result = fdiv <2 x double> %num, %den - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}v_fdiv_v4f64: -define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind { - %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 - %num = load <4 x double>, <4 x double> addrspace(1)* %in - %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 - %result = fdiv <4 x double> %num, %den - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; COMMON-LABEL: {{^}}s_fdiv_v4f64: -define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) { - %result = fdiv <4 x double> %num, %den - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll deleted file mode 100644 index 7cbf8733639..00000000000 --- a/test/CodeGen/R600/fdiv.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | 
FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; These tests check that fdiv is expanded correctly and also test that the -; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate -; instruction groups. - -; FUNC-LABEL: {{^}}fdiv_f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fdiv float %a, %b - store float %0, float addrspace(1)* %out - ret void -} - - - -; FUNC-LABEL: {{^}}fdiv_v2f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { -entry: - %0 = fdiv <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fdiv_v4f32: -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1) * %in - %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr - %result = fdiv <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fetch-limits.r600.ll b/test/CodeGen/R600/fetch-limits.r600.ll deleted file mode 100644 index e7160ef5d72..00000000000 --- a/test/CodeGen/R600/fetch-limits.r600.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=r600 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rs880 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s - -; R600 supports 8 fetches in a clause -; CHECK: {{^}}fetch_limits_r600: -; CHECK: Fetch clause -; CHECK: Fetch clause - -define void @fetch_limits_r600() #0 { -entry: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %4 = load <4 x 
float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %a = fadd <4 x float> %res0, %res1 - %b = fadd <4 x float> %res2, %res3 - %c = fadd <4 x float> %res4, %res5 - %d = fadd <4 x float> %res6, %res7 - %e = fadd <4 x float> %res8, %a - - %bc = fadd <4 x float> %b, %c - %de = fadd <4 x float> %d, %e - - %bcde = fadd <4 x float> %bc, %de - - call void @llvm.R600.store.swizzle(<4 x float> %bcde, i32 0, i32 1) - ret void -} - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/fetch-limits.r700+.ll b/test/CodeGen/R600/fetch-limits.r700+.ll deleted file mode 100644 index acaea2aa794..00000000000 --- a/test/CodeGen/R600/fetch-limits.r700+.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=rv710 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv730 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=rv770 | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=sumo | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=juniper | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=barts | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=turks | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=caicos | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; r700+ supports 16 fetches in a clause -; CHECK: {{^}}fetch_limits_r700: -; CHECK: Fetch clause -; CHECK: Fetch clause - -define void @fetch_limits_r700() #0 { -entry: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 
3) - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %6 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %9 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %11 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11) - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12) - %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 13) - %14 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14) - %15 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 15) - %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16) - %res0 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %0, i32 0, i32 0, i32 1) - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %1, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %2, i32 0, i32 0, i32 1) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %3, i32 0, i32 0, i32 1) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %4, i32 0, i32 0, i32 1) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %5, i32 0, i32 0, i32 1) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %6, i32 0, i32 0, i32 1) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %7, i32 0, i32 0, i32 1) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %9, i32 0, i32 0, i32 1) - %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %10, i32 0, i32 0, i32 1) - %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %11, i32 0, i32 0, i32 1) - %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %12, i32 0, i32 0, i32 1) - %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %13, i32 0, i32 0, i32 1) - %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %14, i32 0, i32 0, i32 1) - %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %15, i32 0, i32 0, i32 1) - %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %16, i32 0, i32 0, i32 1) - %a = fadd <4 x float> %res0, %res1 - %b = fadd <4 x float> %res2, %res3 - %c = fadd <4 x float> %res4, %res5 - %d = fadd <4 x float> %res6, %res7 - %e = fadd <4 x float> %res8, %res9 - %f = fadd <4 x float> %res10, %res11 - %g = fadd <4 x float> %res12, %res13 - %h = fadd <4 x float> 
%res14, %res15 - %i = fadd <4 x float> %res16, %a - - %bc = fadd <4 x float> %b, %c - %de = fadd <4 x float> %d, %e - %fg = fadd <4 x float> %f, %g - %hi = fadd <4 x float> %h, %i - - %bcde = fadd <4 x float> %bc, %de - %fghi = fadd <4 x float> %fg, %hi - - %bcdefghi = fadd <4 x float> %bcde, %fghi - call void @llvm.R600.store.swizzle(<4 x float> %bcdefghi, i32 0, i32 1) - ret void -} - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/ffloor.f64.ll b/test/CodeGen/R600/ffloor.f64.ll deleted file mode 100644 index 45f8382c392..00000000000 --- a/test/CodeGen/R600/ffloor.f64.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.fabs.f64(double %Val) -declare double @llvm.floor.f64(double) nounwind readnone -declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.floor.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.floor.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}ffloor_f64: -; CI: v_floor_f64_e32 -; SI: v_fract_f64_e32 -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 -; SI: s_endpgm -define void @ffloor_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.floor.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_f64_neg: -; CI: v_floor_f64_e64 -; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]] -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] -; SI: s_endpgm -define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { - %neg = fsub double 0.0, %x - %y = call double @llvm.floor.f64(double %neg) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_f64_neg_abs: -; CI: v_floor_f64_e64 -; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]| -; SI: v_min_f64 -; SI: v_cmp_class_f64_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| -; SI: s_endpgm -define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { - %abs = call double @llvm.fabs.f64(double %x) - %neg = fsub double 0.0, %abs - %y = call double @llvm.floor.f64(double %neg) nounwind readnone - store double %y, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v2f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64: -; FIXME-CI: v_floor_f64_e32 -; FIXME-CI: 
v_floor_f64_e32 -; FIXME-CI: v_floor_f64_e32 -; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ffloor_v4f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v8f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ffloor_v16f64: -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -; CI: v_floor_f64_e32 -define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll deleted file mode 100644 index 61c46ac2bc0..00000000000 --- a/test/CodeGen/R600/ffloor.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}floor_f32: -; SI: v_floor_f32_e32 -; R600: FLOOR -define void @floor_f32(float addrspace(1)* %out, float %in) { - %tmp = call float @llvm.floor.f32(float %in) #0 - store float %tmp, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}floor_v2f32: -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 - -define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 - store <2 x float> %tmp, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}floor_v4f32: -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 -; SI: v_floor_f32_e32 - -; R600: FLOOR -; R600: FLOOR -; R600: FLOOR -; R600: FLOOR -define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 - store <4 x float> %tmp, <4 x float> addrspace(1)* %out - ret void -} - -; Function Attrs: nounwind readonly -declare float @llvm.floor.f32(float) #0 - -; Function Attrs: nounwind readonly -declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0 - -; Function Attrs: nounwind readonly -declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0 - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/flat-address-space.ll 
b/test/CodeGen/R600/flat-address-space.ll deleted file mode 100644 index 8ceca078f2d..00000000000 --- a/test/CodeGen/R600/flat-address-space.ll +++ /dev/null @@ -1,184 +0,0 @@ -; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s - -; Disable optimizations in case there are optimizations added that -; specialize away generic pointer accesses. - - -; CHECK-LABEL: {{^}}branch_use_flat_i32: -; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; CHECK: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { -entry: - %cmp = icmp ne i32 %c, 0 - br i1 %cmp, label %local, label %global - -local: - %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)* - br label %end - -global: - %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - br label %end - -end: - %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ] - store i32 %x, i32 addrspace(4)* %fptr, align 4 -; %val = load i32, i32 addrspace(4)* %fptr, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - - - -; These testcases might become useless when there are optimizations to -; remove generic pointers. - -; CHECK-LABEL: {{^}}store_flat_i32: -; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}} -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}} -; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_i64: -; CHECK: flat_store_dwordx2 -define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { - %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - store i64 %x, i64 addrspace(4)* %fptr, align 8 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_v4i32: -; CHECK: flat_store_dwordx4 -define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { - %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_trunc_i16: -; CHECK: flat_store_short -define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %y = trunc i32 %x to i16 - store i16 %y, i16 addrspace(4)* %fptr, align 2 - ret void -} - -; CHECK-LABEL: {{^}}store_flat_trunc_i8: -; CHECK: flat_store_byte -define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %y = trunc i32 %x to i8 - store i8 %y, i8 addrspace(4)* %fptr, align 2 - ret void -} - - - -; CHECK-LABEL @load_flat_i32: -; CHECK: flat_load_dword -define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) 
#0 { - %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* - %fload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %fload, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @load_flat_i64: -; CHECK: flat_load_dwordx2 -define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* - %fload = load i64, i64 addrspace(4)* %fptr, align 4 - store i64 %fload, i64 addrspace(1)* %out, align 8 - ret void -} - -; CHECK-LABEL @load_flat_v4i32: -; CHECK: flat_load_dwordx4 -define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* - %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 4 - store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; CHECK-LABEL @sextload_flat_i8: -; CHECK: flat_load_sbyte -define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load i8, i8 addrspace(4)* %fptr, align 4 - %ext = sext i8 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @zextload_flat_i8: -; CHECK: flat_load_ubyte -define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* - %fload = load i8, i8 addrspace(4)* %fptr, align 4 - %ext = zext i8 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @sextload_flat_i16: -; CHECK: flat_load_sshort -define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load i16, i16 addrspace(4)* %fptr, align 4 - %ext = sext i16 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL @zextload_flat_i16: -; CHECK: flat_load_ushort -define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { - %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* - %fload = load i16, i16 addrspace(4)* %fptr, align 4 - %ext = zext i16 %fload to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - - - -; TODO: This should not be zero when registers are used for small -; scratch allocations again. - -; Check for prologue initializing special SGPRs pointing to scratch. 
-; CHECK-LABEL: {{^}}store_flat_scratch: -; CHECK: s_movk_i32 flat_scratch_lo, 0 -; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}} -; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}} -; CHECK: flat_store_dword -; CHECK: s_barrier -; CHECK: flat_load_dword -define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { - %alloca = alloca i32, i32 9, align 4 - %x = call i32 @llvm.r600.read.tidig.x() #3 - %pptr = getelementptr i32, i32* %alloca, i32 %x - %fptr = addrspacecast i32* %pptr to i32 addrspace(4)* - store i32 %x, i32 addrspace(4)* %fptr - ; Dummy call - call void @llvm.AMDGPU.barrier.local() #1 - %reload = load i32, i32 addrspace(4)* %fptr, align 4 - store i32 %reload, i32 addrspace(1)* %out, align 4 - ret void -} - -declare void @llvm.AMDGPU.barrier.local() #1 -declare i32 @llvm.r600.read.tidig.x() #3 - -attributes #0 = { nounwind } -attributes #1 = { nounwind noduplicate } -attributes #3 = { nounwind readnone } diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll deleted file mode 100644 index c6bfb8567a0..00000000000 --- a/test/CodeGen/R600/floor.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s - -; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = call float @floor(float %r0) - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @floor(float) readonly -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/fma-combine.ll b/test/CodeGen/R600/fma-combine.ll deleted file mode 100644 index bd574b87711..00000000000 --- a/test/CodeGen/R600/fma-combine.ll +++ /dev/null @@ -1,368 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare double @llvm.fabs.f64(double) #0 -declare double @llvm.fma.f64(double, double, double) #0 -declare float @llvm.fma.f32(float, float, float) #0 - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_fma_f64_0: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 
- - %mul = fmul double %a, %b - %fma = fadd double %mul, %c - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %fma0 = fadd double %mul, %c - %fma1 = fadd double %mul, %d - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fadd x, (fmul y, z)) -> (fma y, z, x) -; FUNC-LABEL: {{^}}combine_to_fma_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %fma = fadd double %c, %mul - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %fma = fsub double %mul, %c - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %fma0 = fsub double %mul, %c - %fma1 = fsub double %mul, %d - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %fma = fsub double %c, %mul - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %fma0 = fsub double %c, %mul - %fma1 = fsub double %d, %mul - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias 
%out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma = fsub double %mul.neg, %c - - store double %fma, double addrspace(1)* %gep.out - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma0 = fsub double %mul.neg, %c - %fma1 = fsub double %mul.neg, %d - - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] -; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]] -; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI: s_endpgm -define void 
@combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0 - %b = load double, double addrspace(1)* %gep.1 - %c = load double, double addrspace(1)* %gep.2 - %d = load double, double addrspace(1)* %gep.3 - - %mul = fmul double %a, %b - %mul.neg = fsub double -0.0, %mul - %fma0 = fsub double %mul.neg, %c - %fma1 = fsub double %mul, %d - - store double %fma0, double addrspace(1)* %gep.out.0 - store double %fma1, double addrspace(1)* %gep.out.1 - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} -; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]] -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %x = load double, double addrspace(1)* %gep.0 - %y = load double, double addrspace(1)* %gep.1 - %z = load double, double addrspace(1)* %gep.2 - %u = load double, double addrspace(1)* %gep.3 - %v = load double, double addrspace(1)* %gep.4 - - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0 - %tmp2 = fsub double %tmp1, %z - - store double %tmp2, double addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fma y, z, (fmul u, v))) -; -> (fma (fneg y), z, (fma (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64: -; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} -; 
SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}} -; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}} -; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]] -; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid - - %x = load double, double addrspace(1)* %gep.0 - %y = load double, double addrspace(1)* %gep.1 - %z = load double, double addrspace(1)* %gep.2 - %u = load double, double addrspace(1)* %gep.3 - %v = load double, double addrspace(1)* %gep.4 - - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 - %tmp2 = fsub double %x, %tmp1 - - store double %tmp2, double addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/fma.f64.ll b/test/CodeGen/R600/fma.f64.ll deleted file mode 100644 index 0a55ef77855..00000000000 --- a/test/CodeGen/R600/fma.f64.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.fma.f64(double, double, double) nounwind readnone -declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone -declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone - - -; FUNC-LABEL: {{^}}fma_f64: -; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2, double addrspace(1)* %in3) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = load double, double addrspace(1)* %in3 - %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2) - store double %r3, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v2f64: -; SI: v_fma_f64 -; SI: v_fma_f64 -define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 - %r2 = load <2 x double>, <2 x double> addrspace(1)* %in3 - %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) - store <2 x double> %r3, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v4f64: -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -; SI: v_fma_f64 -define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, - <4 x 
double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { - %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 - %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 - %r2 = load <4 x double>, <4 x double> addrspace(1)* %in3 - %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2) - store <4 x double> %r3, <4 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll deleted file mode 100644 index d6024aa0b4c..00000000000 --- a/test/CodeGen/R600/fma.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fma.f32(float, float, float) nounwind readnone -declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}fma_f32: -; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, -; EG: FMA {{\*? *}}[[RES]] -define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2, float addrspace(1)* %in3) { - %r0 = load float, float addrspace(1)* %in1 - %r1 = load float, float addrspace(1)* %in2 - %r2 = load float, float addrspace(1)* %in3 - %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2) - store float %r3, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v2f32: -; SI: v_fma_f32 -; SI: v_fma_f32 - -; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, -; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] -; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] -define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, - <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { - %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 - %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 - %r2 = load <2 x float>, <2 x float> addrspace(1)* %in3 - %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) - store <2 x float> %r3, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fma_v4f32: -; SI: v_fma_f32 -; SI: v_fma_f32 -; SI: v_fma_f32 -; SI: v_fma_f32 - -; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}}, -; EG-DAG: FMA {{\*? *}}[[RES]].X -; EG-DAG: FMA {{\*? *}}[[RES]].Y -; EG-DAG: FMA {{\*? *}}[[RES]].Z -; EG-DAG: FMA {{\*? 
*}}[[RES]].W -define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, - <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { - %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 - %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 - %r2 = load <4 x float>, <4 x float> addrspace(1)* %in3 - %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2) - store <4 x float> %r3, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 -; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}} -define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b) - store float %fma, float addrspace(1)* %out.gep, align 4 - ret void -} - -; FUNC-LABEL: @fma_commute_mul_s_f32 -define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %c = load float, float addrspace(1)* %in.b.gep, align 4 - - %fma = call float @llvm.fma.f32(float %a, float %b, float %c) - store float %fma, float addrspace(1)* %out.gep, align 4 - ret void -} diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll deleted file mode 100644 index 935e35123f4..00000000000 --- a/test/CodeGen/R600/fmad.ll +++ /dev/null @@ -1,19 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = extractelement <4 x float> %reg0, i32 2 - %r3 = fmul float %r0, %r1 - %r4 = fadd float %r3, %r2 - %vec = insertelement <4 x float> undef, float %r4, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @fabs(float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll deleted file mode 100644 index d7127f485c7..00000000000 --- a/test/CodeGen/R600/fmax.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp oge float %r0, %r1 - %r3 = select i1 %r2, float %r0, float %r1 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call 
void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/fmax3.f64.ll b/test/CodeGen/R600/fmax3.f64.ll deleted file mode 100644 index f78c71b2826..00000000000 --- a/test/CodeGen/R600/fmax3.f64.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.maxnum.f64(double, double) nounwind readnone - -; SI-LABEL: {{^}}test_fmax3_f64: -; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0{{$}} -; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}], 0 offset:16 -; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]] -; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { - %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 - %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 - %a = load double, double addrspace(1)* %aptr, align 8 - %b = load double, double addrspace(1)* %bptr, align 8 - %c = load double, double addrspace(1)* %cptr, align 8 - %f0 = call double @llvm.maxnum.f64(double %a, double %b) nounwind readnone - %f1 = call double @llvm.maxnum.f64(double %f0, double %c) nounwind readnone - store double %f1, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll deleted file mode 100644 index c3028a6217d..00000000000 --- a/test/CodeGen/R600/fmax3.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.maxnum.f32(float, float) nounwind readnone - -; SI-LABEL: {{^}}test_fmax3_olt_0: -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} - -; Commute operand of second fmax -; SI-LABEL: {{^}}test_fmax3_olt_1: -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b 
= load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/fmax_legacy.f64.ll b/test/CodeGen/R600/fmax_legacy.f64.ll deleted file mode 100644 index 828243888ac..00000000000 --- a/test/CodeGen/R600/fmax_legacy.f64.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; Make sure we don't try to form FMAX_LEGACY nodes with f64 - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmax_legacy_uge_f64 -define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp uge double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_oge_f64 -define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp oge double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ugt_f64 -define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ugt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ogt_f64 -define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ogt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll deleted file mode 100644 index 413957d2982..00000000000 --- a/test/CodeGen/R600/fmax_legacy.ll +++ /dev/null @@ -1,116 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn 
-mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FIXME: Should replace unsafe-fp-math with no signed zeros. - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmax_legacy_uge_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] - -; EG: MAX -define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp uge float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_oge_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; EG: MAX -define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp oge float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ugt_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; EG: MAX -define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ugt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_legacy_ogt_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; EG: MAX -define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, 
float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ogt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - - -; FUNC-LABEL: @test_fmax_legacy_ogt_f32_multi_use -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-NOT: v_max_ -; SI: v_cmp_gt_f32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_max_ - -; EG: MAX -define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ogt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out0, align 4 - store i1 %cmp, i1 addrspace(1)* %out1 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmaxnum.f64.ll b/test/CodeGen/R600/fmaxnum.f64.ll deleted file mode 100644 index de563cec341..00000000000 --- a/test/CodeGen/R600/fmaxnum.f64.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.maxnum.f64(double, double) #0 -declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0 -declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0 -declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0 -declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0 - -; FUNC-LABEL: @test_fmax_f64 -; SI: v_max_f64 -define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { - %val = call double @llvm.maxnum.f64(double %a, double %b) #0 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_v2f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 - store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmax_v4f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 - store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmax_v8f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 - store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 - ret 
void -} - -; FUNC-LABEL: @test_fmax_v16f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -; SI: v_max_f64 -define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 - store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll deleted file mode 100644 index 3029bd02e4d..00000000000 --- a/test/CodeGen/R600/fmaxnum.ll +++ /dev/null @@ -1,283 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.maxnum.f32(float, float) #0 -declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0 - -declare double @llvm.maxnum.f64(double, double) - -; FUNC-LABEL: @test_fmax_f32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float %b) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmax_v2f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 - store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmax_v4f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0 - store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmax_v8f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> 
%b) #0 - store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmax_v16f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W -define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 - store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344(nan) -define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_val_nan -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_nan_val -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} 
- -; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_var_immediate_f32 -; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_immediate_var_f32 -; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmax_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], 
{{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll deleted file mode 100644 index defa8c09638..00000000000 --- a/test/CodeGen/R600/fmin.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp uge float %r0, %r1 - %r3 = select i1 %r2, float %r1, float %r0 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll deleted file mode 100644 index 0a76699b43e..00000000000 --- a/test/CodeGen/R600/fmin3.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.minnum.f32(float, float) nounwind readnone - -; SI-LABEL: {{^}}test_fmin3_olt_0: -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} - -; Commute operand of second fmin -; SI-LABEL: {{^}}test_fmin3_olt_1: -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { - %a = load float, float addrspace(1)* %aptr, align 4 - %b = load float, float addrspace(1)* %bptr, align 4 - %c = load float, float addrspace(1)* %cptr, align 4 - %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone - %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone - store float %f1, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/fmin_legacy.f64.ll b/test/CodeGen/R600/fmin_legacy.f64.ll deleted file mode 100644 index e19a48f3f7e..00000000000 --- a/test/CodeGen/R600/fmin_legacy.f64.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc -march=amdgcn 
-mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmin_legacy_f64 -define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { - %r0 = extractelement <4 x double> %reg0, i32 0 - %r1 = extractelement <4 x double> %reg0, i32 1 - %r2 = fcmp uge double %r0, %r1 - %r3 = select i1 %r2, double %r1, double %r0 - %vec = insertelement <4 x double> undef, double %r3, i32 0 - store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ule_f64 -define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ule double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f64 -define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ole double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_olt_f64 -define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp olt double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ult_f64 -define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %cmp = fcmp ult double %a, %b - %val = select i1 %cmp, double %a, double %b - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll deleted file mode 100644 index 6a625c239d7..00000000000 --- a/test/CodeGen/R600/fmin_legacy.ll +++ /dev/null @@ -1,123 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG 
-check-prefix=FUNC %s - -; FIXME: Should replace unsafe-fp-math with no signed zeros. - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FUNC-LABEL: @test_fmin_legacy_f32 -; EG: MIN * -; SI-SAFE: v_min_legacy_f32_e32 -; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fcmp uge float %r0, %r1 - %r3 = select i1 %r2, float %r1, float %r0 - %vec = insertelement <4 x float> undef, float %r3, i32 0 - store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ule_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ule float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ole float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_olt_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp olt float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ult_f32 -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ult float %a, %b - %val = select i1 %cmp, float %a, float %b - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_legacy_ole_f32_multi_use -; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-NOT: v_min -; SI: v_cmp_le_f32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_min -; SI: s_endpgm -define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %cmp = fcmp ole float %a, %b - %val0 = select i1 %cmp, float %a, float %b - store float %val0, float addrspace(1)* %out0, align 4 - store i1 %cmp, i1 addrspace(1)* %out1 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fminnum.f64.ll b/test/CodeGen/R600/fminnum.f64.ll deleted file mode 100644 index 0f929d6a81f..00000000000 --- a/test/CodeGen/R600/fminnum.f64.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.minnum.f64(double, double) #0 -declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0 -declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0 -declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0 -declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 - -; FUNC-LABEL: @test_fmin_f64 -; SI: v_min_f64 -define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { - %val = call double @llvm.minnum.f64(double %a, double %b) #0 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_v2f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { - %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 - store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_v4f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { - %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 - store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmin_v8f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; 
SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { - %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 - store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @test_fmin_v16f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -; SI: v_min_f64 -define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { - %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 - store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll deleted file mode 100644 index 4d7b52540d8..00000000000 --- a/test/CodeGen/R600/fminnum.ll +++ /dev/null @@ -1,281 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.minnum.f32(float, float) #0 -declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 - -; FUNC-LABEL: @test_fmin_f32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.minnum.f32(float %a, float %b) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @test_fmin_v2f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 - store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @test_fmin_v4f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 - store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: @test_fmin_v8f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 
{{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 - store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; FUNC-LABEL: @test_fmin_v16f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 - store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_val_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_nan_val -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 
[[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_var_immediate_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_immediate_var_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}} - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_literal_f32(float 
addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @fmin_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 - store float %val, float addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll deleted file mode 100644 index addc409c9eb..00000000000 --- a/test/CodeGen/R600/fmul.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}fmul_f32: -; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W - -; SI: v_mul_f32 -define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fmul float %a, %b - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.R600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - -; FUNC-LABEL: {{^}}fmul_v2f32: -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} - -; SI: v_mul_f32 -; SI: v_mul_f32 -define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { -entry: - %0 = fmul <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v4f32: -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1) * %in - %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr - %result = fmul <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_mul_2_k: -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -; SI: s_endpgm -define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { - %y = fmul float %x, 2.0 - %z = fmul float %y, 3.0 - store float %z, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_mul_2_k_inv: -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -; SI-NOT: v_mad_f32 -; SI: s_endpgm -define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { - %y = fmul float %x, 3.0 - %z = fmul float %y, 2.0 - store float %z, float addrspace(1)* %out - ret void -} - -; There should be three multiplies here; %a should be used twice (once -; negated), not duplicated into mul x, 5.0 and mul x, -5.0. 
-; FUNC-LABEL: {{^}}test_mul_twouse: -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI: v_mul_f32 -; SI-NOT: v_mul_f32 -define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { - %a = fmul float %x, 5.0 - %b = fsub float -0.0, %a - %c = fmul float %b, %y - %d = fmul float %c, %a - store float %d, float addrspace(1)* %out - ret void -} - -attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll deleted file mode 100644 index 3c222eaba89..00000000000 --- a/test/CodeGen/R600/fmul64.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - -; FUNC-LABEL: {{^}}fmul_f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fmul double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v2f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2) { - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 - %r2 = fmul <2 x double> %r0, %r1 - store <2 x double> %r2, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fmul_v4f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, - <4 x double> addrspace(1)* %in2) { - %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 - %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 - %r2 = fmul <4 x double> %r0, %r1 - store <4 x double> %r2, <4 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll deleted file mode 100644 index ae84d841021..00000000000 --- a/test/CodeGen/R600/fmuladd.ll +++ /dev/null @@ -1,199 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s - -declare float @llvm.fmuladd.f32(float, float, float) -declare double @llvm.fmuladd.f64(double, double, double) -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; CHECK-LABEL: {{^}}fmuladd_f32: -; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} - -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2, float addrspace(1)* %in3) { - %r0 = load float, float addrspace(1)* %in1 - %r1 = load float, float addrspace(1)* %in2 - %r2 = load float, float addrspace(1)* %in3 - %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2) - store float %r3, float addrspace(1)* %out - ret void -} 
- -; CHECK-LABEL: {{^}}fmuladd_f64: -; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2, double addrspace(1)* %in3) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = load double, double addrspace(1)* %in3 - %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2) - store double %r3, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; CHECK-LABEL: {{^}}fadd_a_a_b_f32: -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load float, float addrspace(1)* %gep.0 - %r1 = load float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %add.0, %r1 - store float %add.1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fadd_b_a_a_f32: -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, - float addrspace(1)* %in1, - float addrspace(1)* %in2) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r0 = load float, float addrspace(1)* %gep.0 - %r1 = load float, float addrspace(1)* %gep.1 - - %add.0 = fadd float %r0, %r0 - %add.1 = fadd float %r1, %add.0 - store float %add.1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r1.fneg = fsub float -0.000000e+00, %r1 - - %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* 
%out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r1.fneg = fsub float -0.000000e+00, %r1 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2) - store float %r3, float addrspace(1)* %gep.out - ret void -} - - -; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32 -; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] -; CHECK: buffer_store_dword [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %r2.fneg = fsub float -0.000000e+00, %r2 - - %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg) - store float %r3, float addrspace(1)* %gep.out - ret void -} diff --git a/test/CodeGen/R600/fnearbyint.ll b/test/CodeGen/R600/fnearbyint.ll deleted file mode 100644 index 4fa9adaabda..00000000000 --- a/test/CodeGen/R600/fnearbyint.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s - -; This should have the exactly the same output as the test for rint, -; so no need to check anything. 
- -declare float @llvm.nearbyint.f32(float) #0 -declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #0 -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #0 -declare double @llvm.nearbyint.f64(double) #0 -declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 -declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 - - -define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { -entry: - %0 = call float @llvm.nearbyint.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { -entry: - %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { -entry: - %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -define void @nearbyint_f64(double addrspace(1)* %out, double %in) { -entry: - %0 = call double @llvm.nearbyint.f64(double %in) - store double %0, double addrspace(1)* %out - ret void -} -define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { -entry: - %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) - store <2 x double> %0, <2 x double> addrspace(1)* %out - ret void -} - -define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { -entry: - %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) - store <4 x double> %0, <4 x double> addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind readonly } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll deleted file mode 100644 index 8830e827366..00000000000 --- a/test/CodeGen/R600/fneg-fabs.f64.ll +++ /dev/null @@ -1,100 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FIXME: Check something here. Currently it seems fabs + fneg aren't -; into 2 modifiers, although theoretically that should work. 
- -; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}| -define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { - %fabs = call double @llvm.fabs.f64(double %x) - %fsub = fsub double -0.000000e+00, %fabs - %fadd = fadd double %y, %fsub - store double %fadd, double addrspace(1)* %out, align 8 - ret void -} - -define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { - %x = load double, double addrspace(1)* %xptr, align 8 - %y = load double, double addrspace(1)* %xptr, align 8 - %fabs = call double @llvm.fabs.f64(double %x) - %fsub = fsub double -0.000000e+00, %fabs - %fadd = fadd double %y, %fsub - store double %fadd, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64: -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}| -define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { - %fabs = call double @llvm.fabs.f64(double %x) - %fsub = fsub double -0.000000e+00, %fabs - %fmul = fmul double %y, %fsub - store double %fmul, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_free_f64: -define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fabs = call double @llvm.fabs.f64(double %bc) - %fsub = fsub double -0.000000e+00, %fabs - store double %fsub, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fabs = call double @fabs(double %bc) - %fsub = fsub double -0.000000e+00, %fabs - store double %fsub, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_f64: -; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}} -; SI: s_load_dwordx2 -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]] -; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}} -define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) { - %fabs = call double @llvm.fabs.f64(double %in) - %fsub = fsub double -0.000000e+00, %fabs - store double %fsub, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v2f64: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { - %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) - %fsub = fsub <2 x double> , %fabs - store <2 x double> %fsub, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v4f64: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { - %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) - %fsub = fsub <4 x double> , %fabs - 
store <4 x double> %fsub, <4 x double> addrspace(1)* %out - ret void -} - -declare double @fabs(double) readnone -declare double @llvm.fabs.f64(double) readnone -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone -declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll deleted file mode 100644 index 3b4930d9897..00000000000 --- a/test/CodeGen/R600/fneg-fabs.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: -; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| -define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { - %fabs = call float @llvm.fabs.f32(float %x) - %fsub = fsub float -0.000000e+00, %fabs - %fadd = fadd float %y, %fsub - store float %fadd, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: -; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| -; SI-NOT: and -define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { - %fabs = call float @llvm.fabs.f32(float %x) - %fsub = fsub float -0.000000e+00, %fabs - %fmul = fmul float %y, %fsub - store float %fmul, float addrspace(1)* %out, align 4 - ret void -} - -; DAGCombiner will transform: -; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF)) -; unless isFabsFree returns true - -; FUNC-LABEL: {{^}}fneg_fabs_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fabs = call float @llvm.fabs.f32(float %bc) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32: -; R600-NOT: AND -; R600: |PV.{{[XYZW]}}| -; R600: -PV - -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fabs = call float @fabs(float %bc) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) { - %fabs = call float @llvm.fabs.f32(float %in) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_fneg_fabs_f32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} -define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %val = load float, float addrspace(1)* %in, align 4 - %fabs = call float @llvm.fabs.f32(float %val) - %fsub = fsub float -0.000000e+00, %fabs - store float %fsub, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fneg_fabs_v2f32: -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: -PV -; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; R600: 
-PV - -; FIXME: SGPR should be used directly for first src operand. -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) - %fsub = fsub <2 x float> , %fabs - store <2 x float> %fsub, <2 x float> addrspace(1)* %out - ret void -} - -; FIXME: SGPR should be used directly for first src operand. -; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000 -; SI-NOT: 0x80000000 -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]] -define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { - %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) - %fsub = fsub <4 x float> , %fabs - store <4 x float> %fsub, <4 x float> addrspace(1)* %out - ret void -} - -declare float @fabs(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/fneg.f64.ll b/test/CodeGen/R600/fneg.f64.ll deleted file mode 100644 index aa6df209035..00000000000 --- a/test/CodeGen/R600/fneg.f64.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_f64: -; GCN: v_xor_b32 -define void @fneg_f64(double addrspace(1)* %out, double %in) { - %fneg = fsub double -0.000000e+00, %in - store double %fneg, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v2f64: -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) { - %fneg = fsub <2 x double> , %in - store <2 x double> %fneg, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v4f64: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { - %fneg = fsub <4 x double> , %in - store <4 x double> %fneg, <4 x double> addrspace(1)* %out - ret void -} - -; DAGCombiner will transform: -; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x80000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fneg_free_f64: -; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, 0, -{{s\[[0-9]+:[0-9]+\]$}} -define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) { - %bc = bitcast i64 %in to double - %fsub = fsub double 0.0, %bc - store double %fsub, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}fneg_fold_f64: -; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-NOT: xor -; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f64(double addrspace(1)* %out, double %in) { - %fsub = fsub double -0.0, %in - %fmul = fmul double %fsub, %in - 
store double %fmul, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll deleted file mode 100644 index a0fd539863c..00000000000 --- a/test/CodeGen/R600/fneg.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fneg_f32: -; R600: -PV - -; GCN: v_xor_b32 -define void @fneg_f32(float addrspace(1)* %out, float %in) { - %fneg = fsub float -0.000000e+00, %in - store float %fneg, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v2f32: -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { - %fneg = fsub <2 x float> , %in - store <2 x float> %fneg, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_v4f32: -; R600: -PV -; R600: -T -; R600: -PV -; R600: -PV - -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { - %fneg = fsub <4 x float> , %in - store <4 x float> %fneg, <4 x float> addrspace(1)* %out - ret void -} - -; DAGCombiner will transform: -; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000)) -; unless the target returns true for isNegFree() - -; FUNC-LABEL: {{^}}fneg_free_f32: -; R600-NOT: XOR -; R600: -KC0[2].Z - -; XXX: We could use v_add_f32_e64 with the negate bit here instead. -; GCN: v_sub_f32_e64 v{{[0-9]}}, 0, s{{[0-9]+$}} -define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) { - %bc = bitcast i32 %in to float - %fsub = fsub float 0.0, %bc - store float %fsub, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fneg_fold_f32: -; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c -; GCN-NOT: xor -; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] -define void @fneg_fold_f32(float addrspace(1)* %out, float %in) { - %fsub = fsub float -0.0, %in - %fmul = fmul float %fsub, %in - store float %fmul, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fp-classify.ll b/test/CodeGen/R600/fp-classify.ll deleted file mode 100644 index 4fac5176fac..00000000000 --- a/test/CodeGen/R600/fp-classify.ll +++ /dev/null @@ -1,131 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_isinf_pattern: -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x204{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm -define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; 
SI-LABEL: {{^}}test_not_isinf_pattern_0: -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_not_isinf_pattern_1: -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { - %fabs = tail call float @llvm.fabs.f32(float %x) #1 - %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 - %ext = zext i1 %cmp to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_isfinite_pattern_0: -; SI-NOT: v_cmp -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1f8{{$}} -; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] -; SI-NOT: v_cmp -; SI: s_endpgm -define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Use negative infinity -; SI-LABEL: {{^}}test_isfinite_not_pattern_0: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; No fabs -; SI-LABEL: {{^}}test_isfinite_not_pattern_1: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %ninf = fcmp une float %x, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; fabs of different value -; SI-LABEL: {{^}}test_isfinite_not_pattern_2: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Wrong ordered compare type -; SI-LABEL: {{^}}test_isfinite_not_pattern_3: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp uno float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Wrong unordered compare -; SI-LABEL: {{^}}test_isfinite_not_pattern_4: -; SI-NOT: v_cmp_class_f32 -; SI: s_endpgm -define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { - %ord = fcmp ord float %x, 0.000000e+00 - %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 - %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 - %and = and i1 %ord, %ninf - %ext = zext i1 
%and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll deleted file mode 100644 index 5a79ca82bc2..00000000000 --- a/test/CodeGen/R600/fp16_to_fp.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone -declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp16_to_fp32: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - - -; SI-LABEL: {{^}}test_convert_fp16_to_fp64: -; SI: buffer_load_ushort [[VAL:v[0-9]+]] -; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] -; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { - %val = load i16, i16 addrspace(1)* %in, align 2 - %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone - store double %cvt, double addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll deleted file mode 100644 index 67925ebd82b..00000000000 --- a/test/CodeGen/R600/fp32_to_fp16.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone - -; SI-LABEL: {{^}}test_convert_fp32_to_fp16: -; SI: buffer_load_dword [[VAL:v[0-9]+]] -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_short [[RESULT]] -define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %val = load float, float addrspace(1)* %in, align 4 - %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone - store i16 %cvt, i16 addrspace(1)* %out, align 2 - ret void -} diff --git a/test/CodeGen/R600/fp_to_sint.f64.ll b/test/CodeGen/R600/fp_to_sint.f64.ll deleted file mode 100644 index 12df6606e8f..00000000000 --- a/test/CodeGen/R600/fp_to_sint.f64.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @fp_to_sint_f64_i32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { - %result = fptosi double %in to i32 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v2f64_v2i32(<2 x 
i32> addrspace(1)* %out, <2 x double> %in) { - %result = fptosi <2 x double> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_v4f64_v4i32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { - %result = fptosi <4 x double> %in to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @fp_to_sint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} -; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 - -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} -; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] - -; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 - -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] -; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] -; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %val = load double, double addrspace(1)* %gep, align 8 - %cast = fptosi double %val to i64 - store i64 %cast, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll deleted file mode 100644 index 301a94b4904..00000000000 --- a/test/CodeGen/R600/fp_to_sint.ll +++ /dev/null @@ -1,230 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -declare float @llvm.fabs.f32(float) #0 - -; FUNC-LABEL: {{^}}fp_to_sint_i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: s_endpgm -define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { - %conv = fptosi float %in to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: -; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { - %in.fabs = call float @llvm.fabs.f32(float %in) #0 - %conv = fptosi float %in.fabs to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_v2i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { - %result = fptosi <2 x float> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_v4i32: -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -; SI: v_cvt_i32_f32_e32 -; 
SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %value = load <4 x float>, <4 x float> addrspace(1) * %in - %result = fptosi <4 x float> %value to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_sint_i64: - -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; Check that the compiler doesn't crash with a "cannot select" error -; SI: s_endpgm -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { -entry: - %0 = fptosi float %in to i64 - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_sint_v2i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { - %conv = fptosi <2 x float> %x to <2 x i64> - store <2 x i64> %conv, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_sint_v4i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; 
EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { - %conv = fptosi <4 x float> %x to <4 x i64> - store <4 x i64> %conv, <4 x i64> addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll deleted file mode 100644 index 41bc2a78001..00000000000 --- a/test/CodeGen/R600/fp_to_uint.f64.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}fp_to_uint_i32_f64: -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { - %cast = fptoui double %in to i32 - store i32 %cast, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @fp_to_uint_v2i32_v2f64 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { - %cast = fptoui <2 x double> %in to <2 x i32> - store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: @fp_to_uint_v4i32_v4f64 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { - %cast = fptoui <4 x double> %in to <4 x i32> - store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: @fp_to_uint_i64_f64 -; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]] -; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}} -; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000 - -; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}} -; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]] - -; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000 - -; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]] -; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] -; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] -; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %val = load double, double addrspace(1)* %gep, align 8 - %cast = fptoui double %val to i64 - store i64 %cast, i64 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: @fp_to_uint_v2i64_v2f64 -define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { - %cast = fptoui <2 x double> %in to <2 x i64> - store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: @fp_to_uint_v4i64_v4f64 -define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { - %cast = fptoui <4 x double> %in to <4 x i64> - store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll deleted file mode 100644 index b7b6ccc238b..00000000000 --- a/test/CodeGen/R600/fp_to_uint.ll +++ /dev/null @@ -1,217 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC -; 
RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} - -; SI: v_cvt_u32_f32_e32 -; SI: s_endpgm -define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { - %conv = fptoui float %in to i32 - store i32 %conv, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_uint_v2f32_to_v2i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { - %result = fptoui <2 x float> %in to <2 x i32> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fp_to_uint_v4f32_to_v4i32: -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 -; SI: v_cvt_u32_f32_e32 - -define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %value = load <4 x float>, <4 x float> addrspace(1) * %in - %result = fptoui <4 x float> %value to <4 x i32> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_f32_to_i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { - %conv = fptoui float %x to i64 - store i64 %conv, i64 addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_v2f32_to_v2i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { - %conv = fptoui <2 x float> %x to <2 x i64> - store <2 x i64> %conv, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC: {{^}}fp_to_uint_v4f32_to_v4i64: -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: 
LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: AND_INT -; EG-DAG: LSHR -; EG-DAG: SUB_INT -; EG-DAG: AND_INT -; EG-DAG: ASHR -; EG-DAG: AND_INT -; EG-DAG: OR_INT -; EG-DAG: SUB_INT -; EG-DAG: LSHL -; EG-DAG: LSHL -; EG-DAG: SUB_INT -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT -; EG-DAG: SETGT_INT -; EG-DAG: XOR_INT -; EG-DAG: XOR_INT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: s_endpgm -define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { - %conv = fptoui <4 x float> %x to <4 x i64> - store <4 x i64> %conv, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll deleted file mode 100644 index 734a43be229..00000000000 --- a/test/CodeGen/R600/fpext.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fpext_f32_to_f64: -; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { - %result = fpext float %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { - %result = fpext <2 x float> %in to <2 x double> - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v4f32_to_v4f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { - %result = fpext <4 x float> %in to <4 x double> - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fpext_v8f32_to_v8f64: -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -; SI: v_cvt_f64_f32_e32 -define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { - %result = fpext <8 x float> %in to <8 x double> - store <8 x double> %result, <8 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll deleted file mode 100644 index 385e10e7baa..00000000000 --- a/test/CodeGen/R600/fptrunc.ll +++ /dev/null @@ -1,45 +0,0 @@ -; 
RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: -; SI: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { - %result = fptrunc double %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { - %result = fptrunc <2 x double> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v4f64_to_v4f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { - %result = fptrunc <4 x double> %in to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}fptrunc_v8f64_to_v8f32: -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -; SI: v_cvt_f32_f64_e32 -define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { - %result = fptrunc <8 x double> %in to <8 x float> - store <8 x float> %result, <8 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll deleted file mode 100644 index f245ef08cb9..00000000000 --- a/test/CodeGen/R600/frem.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}frem_f32: -; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} -; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN-DAG: v_cmp -; GCN-DAG: v_mul_f32 -; GCN: v_rcp_f32_e32 -; GCN: v_mul_f32_e32 -; GCN: v_mul_f32_e32 -; GCN: v_trunc_f32_e32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2) #0 { - %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 - %r0 = load float, float addrspace(1)* %in1, align 4 - %r1 = load float, float addrspace(1)* %gep2, align 4 - %r2 = frem float %r0, %r1 - store float %r2, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}unsafe_frem_f32: -; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}} -; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]] -; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]] -; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] -; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] -; GCN: buffer_store_dword [[RESULT]] -; GCN: s_endpgm -define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, - float addrspace(1)* %in2) #1 { - %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 - %r0 = load float, float addrspace(1)* %in1, align 4 - %r1 = load 
float, float addrspace(1)* %gep2, align 4 - %r2 = frem float %r0, %r1 - store float %r2, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}frem_f64: -; GCN: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; GCN: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], {{.*}}, 0 -; GCN-DAG: v_div_fmas_f64 -; GCN-DAG: v_div_scale_f64 -; GCN-DAG: v_mul_f64 -; CI: v_trunc_f64_e32 -; CI: v_mul_f64 -; GCN: v_add_f64 -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm -define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) #0 { - %r0 = load double, double addrspace(1)* %in1, align 8 - %r1 = load double, double addrspace(1)* %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}unsafe_frem_f64: -; GCN: v_rcp_f64_e32 -; GCN: v_mul_f64 -; SI: v_bfe_u32 -; CI: v_trunc_f64_e32 -; GCN: v_fma_f64 -; GCN: s_endpgm -define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) #1 { - %r0 = load double, double addrspace(1)* %in1, align 8 - %r1 = load double, double addrspace(1)* %in2, align 8 - %r2 = frem double %r0, %r1 - store double %r2, double addrspace(1)* %out, align 8 - ret void -} - -define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, - <2 x float> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 - %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 - %r1 = load <2 x float>, <2 x float> addrspace(1)* %gep2, align 8 - %r2 = frem <2 x float> %r0, %r1 - store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, - <4 x float> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 - %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 - %r1 = load <4 x float>, <4 x float> addrspace(1)* %gep2, align 16 - %r2 = frem <4 x float> %r0, %r1 - store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, - <2 x double> addrspace(1)* %in2) #0 { - %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 - %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 - %r1 = load <2 x double>, <2 x double> addrspace(1)* %gep2, align 16 - %r2 = frem <2 x double> %r0, %r1 - store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -attributes #0 = { nounwind "unsafe-fp-math"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" } diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll deleted file mode 100644 index 04101346cdf..00000000000 --- a/test/CodeGen/R600/fsqrt.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck %s - -; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x) - -; CHECK: {{^}}fsqrt_f32: -; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} - -define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* 
%in) { - %r0 = load float, float addrspace(1)* %in - %r1 = call float @llvm.sqrt.f32(float %r0) - store float %r1, float addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fsqrt_f64: -; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} - -define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %r0 = load double, double addrspace(1)* %in - %r1 = call double @llvm.sqrt.f64(double %r0) - store double %r1, double addrspace(1)* %out - ret void -} - -declare float @llvm.sqrt.f32(float %Val) -declare double @llvm.sqrt.f64(double %Val) diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll deleted file mode 100644 index dfe41cb5b11..00000000000 --- a/test/CodeGen/R600/fsub.ll +++ /dev/null @@ -1,75 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}v_fsub_f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 - %a = load float, float addrspace(1)* %in, align 4 - %b = load float, float addrspace(1)* %b_ptr, align 4 - %result = fsub float %a, %b - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_fsub_f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W - -; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { - %sub = fsub float %a, %b - store float %sub, float addrspace(1)* %out, align 4 - ret void -} - -declare float @llvm.R600.load.input(i32) readnone - -declare void @llvm.AMDGPU.store.output(float, i32) - -; FUNC-LABEL: {{^}}fsub_v2f32: -; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z -; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y - -; FIXME: Should be using SGPR directly for first operand -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { - %sub = fsub <2 x float> %a, %b - store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_fsub_v4f32: -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} - -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 - %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 - %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 - %result = fsub <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: Should be 
using SGPR directly for first operand - -; FUNC-LABEL: {{^}}s_fsub_v4f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: s_endpgm -define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { - %result = fsub <4 x float> %a, %b - store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll deleted file mode 100644 index f34a48e30a8..00000000000 --- a/test/CodeGen/R600/fsub64.ll +++ /dev/null @@ -1,107 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.fabs.f64(double) #0 - -; SI-LABEL: {{^}}fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r2 = fsub double %r0, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_fabs_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r1.fabs = call double @llvm.fabs.f64(double %r1) #0 - %r2 = fsub double %r0, %r1.fabs - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_fabs_inv_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} -define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, - double addrspace(1)* %in2) { - %r0 = load double, double addrspace(1)* %in1 - %r1 = load double, double addrspace(1)* %in2 - %r0.fabs = call double @llvm.fabs.f64(double %r0) #0 - %r2 = fsub double %r0.fabs, %r1 - store double %r2, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double %a, %b - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_imm_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], 4.0, -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double 4.0, %a - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_imm_inv_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -4.0, s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { - %sub = fsub double %a, 4.0 - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_self_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { - %sub = fsub double %a, %a - store double %sub, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_v2f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], 
-v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { - %sub = fsub <2 x double> %a, %b - store <2 x double> %sub, <2 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}fsub_v4f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 - %a = load <4 x double>, <4 x double> addrspace(1)* %in - %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr - %result = fsub <4 x double> %a, %b - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_fsub_v4f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { - %result = fsub <4 x double> %a, %b - store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll deleted file mode 100644 index 6618d8b5e57..00000000000 --- a/test/CodeGen/R600/ftrunc.f64.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.trunc.f64(double) nounwind readnone -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone -declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone -declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone -declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone -declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone - -; FUNC-LABEL: {{^}}v_ftrunc_f64: -; CI: v_trunc_f64 -; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 -; SI: s_endpgm -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %x = load double, double addrspace(1)* %in, align 8 - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_f64: -; CI: v_trunc_f64_e32 - -; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 -; SI: cmp_gt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: cmp_lt_i32 -; SI: cndmask_b32 -; SI: cndmask_b32 -; SI: s_endpgm -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { - %y = call double @llvm.trunc.f64(double %x) nounwind readnone - store double %y, double addrspace(1)* %out - ret 
void -} - -; FUNC-LABEL: {{^}}ftrunc_v2f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { - %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone - store <2 x double> %y, <2 x double> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64: -; FIXME-CI: v_trunc_f64_e32 -; FIXME-CI: v_trunc_f64_e32 -; FIXME-CI: v_trunc_f64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { -; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone -; store <3 x double> %y, <3 x double> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ftrunc_v4f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { - %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone - store <4 x double> %y, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v8f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { - %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone - store <8 x double> %y, <8 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v16f64: -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -; CI: v_trunc_f64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { - %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone - store <16 x double> %y, <16 x double> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll deleted file mode 100644 index edc08609a8a..00000000000 --- a/test/CodeGen/R600/ftrunc.ll +++ /dev/null @@ -1,120 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s - -declare float @llvm.trunc.f32(float) nounwind readnone -declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone -declare <3 x float> @llvm.trunc.v3f32(<3 x float>) nounwind readnone -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone -declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone -declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone - -; FUNC-LABEL: {{^}}ftrunc_f32: -; EG: TRUNC -; SI: v_trunc_f32_e32 -define void @ftrunc_f32(float addrspace(1)* %out, float %x) { - %y = call float @llvm.trunc.f32(float %x) nounwind readnone - store float %y, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v2f32: -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { - %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone - store <2 x float> %y, <2 x 
float> addrspace(1)* %out - ret void -} - -; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32: -; FIXME-EG: TRUNC -; FIXME-EG: TRUNC -; FIXME-EG: TRUNC -; FIXME-SI: v_trunc_f32_e32 -; FIXME-SI: v_trunc_f32_e32 -; FIXME-SI: v_trunc_f32_e32 -; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { -; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone -; store <3 x float> %y, <3 x float> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}ftrunc_v4f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { - %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone - store <4 x float> %y, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v8f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { - %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone - store <8 x float> %y, <8 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ftrunc_v16f32: -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; EG: TRUNC -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -; SI: v_trunc_f32_e32 -define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { - %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone - store <16 x float> %y, <16 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll deleted file mode 100644 index 471b0f6b13e..00000000000 --- a/test/CodeGen/R600/gep-address-space.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s - -define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { -; CHECK-LABEL: {{^}}use_gep_address_space: -; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} -; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 - %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16 - store i32 99, i32 addrspace(3)* %p - ret void -} - -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { -; CHECK-LABEL: {{^}}use_gep_address_space_large_offset: -; The LDS offset will be 65536 bytes, which is larger than the size of LDS on -; SI, which is why it is being OR'd with the base pointer. 
-; SI: s_or_b32 -; CI: s_add_i32 -; CHECK: ds_write_b32 - %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 - store i32 99, i32 addrspace(3)* %p - ret void -} - -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { -; CHECK-LABEL: {{^}}gep_as_vector_v4: -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 -; CHECK: s_add_i32 - %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> - %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 - %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 - %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2 - %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3 - store i32 99, i32 addrspace(3)* %p0 - store i32 99, i32 addrspace(3)* %p1 - store i32 99, i32 addrspace(3)* %p2 - store i32 99, i32 addrspace(3)* %p3 - ret void -} - -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { -; CHECK-LABEL: {{^}}gep_as_vector_v2: -; CHECK: s_add_i32 -; CHECK: s_add_i32 - %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> - %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 - %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 - store i32 99, i32 addrspace(3)* %p0 - store i32 99, i32 addrspace(3)* %p1 - ret void -} - diff --git a/test/CodeGen/R600/global-directive.ll b/test/CodeGen/R600/global-directive.ll deleted file mode 100644 index be775cf9292..00000000000 --- a/test/CodeGen/R600/global-directive.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; Make sure the GlobalDirective isn't merged with the function name - -; SI: .globl foo -; SI: {{^}}foo: -define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global-extload-i1.ll b/test/CodeGen/R600/global-extload-i1.ll deleted file mode 100644 index bd9557d730f..00000000000 --- a/test/CodeGen/R600/global-extload-i1.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen broken - -; FUNC-LABEL: {{^}}zextload_global_i1_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = zext i1 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i1_to_i32: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1{{$}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = sext i1 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i32: -; SI: 
s_endpgm -define void @zextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = zext <1 x i1> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i32: -; SI: s_endpgm -define void @sextload_global_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = sext <1 x i1> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = zext <2 x i1> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = sext <2 x i1> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = zext <4 x i1> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = sext <4 x i1> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = zext <8 x i1> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = sext <8 x i1> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = zext <16 x i1> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = sext <16 x i1> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i32: -; 
XSI: s_endpgm -; define void @zextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = zext <32 x i1> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i32: -; XSI: s_endpgm -; define void @sextload_global_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = sext <32 x i1> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i32: -; XSI: s_endpgm -; define void @zextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = zext <64 x i1> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i32: -; XSI: s_endpgm -; define void @sextload_global_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = sext <64 x i1> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}zextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} -; SI: buffer_store_dwordx2 -define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = zext i1 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i1_to_i64: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]], -; SI: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %a = load i1, i1 addrspace(1)* %in - %ext = sext i1 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i1_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = zext <1 x i1> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i1_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i1>, <1 x i1> addrspace(1)* %in - %ext = sext <1 x i1> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i1_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> addrspace(1)* %in - %ext = zext <2 x i1> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i1_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i1>, <2 x i1> 
addrspace(1)* %in - %ext = sext <2 x i1> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i1_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = zext <4 x i1> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i1_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i1>, <4 x i1> addrspace(1)* %in - %ext = sext <4 x i1> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i1_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = zext <8 x i1> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i1_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i1>, <8 x i1> addrspace(1)* %in - %ext = sext <8 x i1> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i1_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = zext <16 x i1> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i1_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i1>, <16 x i1> addrspace(1)* %in - %ext = sext <16 x i1> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i1_to_v32i64: -; XSI: s_endpgm -; define void @zextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = zext <32 x i1> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i1_to_v32i64: -; XSI: s_endpgm -; define void @sextload_global_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i1>, <32 x i1> addrspace(1)* %in -; %ext = sext <32 x i1> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i1_to_v64i64: -; XSI: s_endpgm -; define void @zextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = zext <64 x i1> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i1_to_v64i64: -; XSI: s_endpgm -; define void @sextload_global_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x 
i1> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i1>, <64 x i1> addrspace(1)* %in -; %ext = sext <64 x i1> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } diff --git a/test/CodeGen/R600/global-extload-i16.ll b/test/CodeGen/R600/global-extload-i16.ll deleted file mode 100644 index 103a40dee27..00000000000 --- a/test/CodeGen/R600/global-extload-i16.ll +++ /dev/null @@ -1,302 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: cypress is broken because the bigger testcases spill and it's not implemented - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i32: -; SI: buffer_load_ushort -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i16_to_i32: -; SI: buffer_load_sshort -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: -; SI: buffer_load_ushort -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: -; SI: buffer_load_sshort -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* 
nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i32> - store <32 x i32> %ext, <32 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i32> - store <64 x i32> %ext, <64 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_i16_to_i64: -; SI: buffer_load_ushort v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = zext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; 
FUNC-LABEL: {{^}}sextload_global_i16_to_i64: -; SI: buffer_load_sshort [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %a = load i16, i16 addrspace(1)* %in - %ext = sext i16 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = zext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i16>, <1 x i16> addrspace(1)* %in - %ext = sext <1 x i16> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = zext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i16>, <2 x i16> addrspace(1)* %in - %ext = sext <2 x i16> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = zext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i16>, <4 x i16> addrspace(1)* %in - %ext = sext <4 x i16> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = zext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i16>, <8 x i16> addrspace(1)* %in - %ext = sext <8 x i16> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = zext <16 x i16> %load to 
<16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i16>, <16 x i16> addrspace(1)* %in - %ext = sext <16 x i16> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = zext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: -; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i16>, <32 x i16> addrspace(1)* %in - %ext = sext <32 x i16> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = zext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: -; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { - %load = load <64 x i16>, <64 x i16> addrspace(1)* %in - %ext = sext <64 x i16> %load to <64 x i64> - store <64 x i64> %ext, <64 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global-extload-i32.ll b/test/CodeGen/R600/global-extload-i32.ll deleted file mode 100644 index 79b83452939..00000000000 --- a/test/CodeGen/R600/global-extload-i32.ll +++ /dev/null @@ -1,457 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}zextload_global_i32_to_i64: -; SI: buffer_load_dword v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %a = load i32, i32 addrspace(1)* %in - %ext = zext i32 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i32_to_i64: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %a = load i32, i32 addrspace(1)* %in - %ext = sext i32 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i32_to_v1i64: -; SI: buffer_load_dword -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x 
i32>, <1 x i32> addrspace(1)* %in - %ext = zext <1 x i32> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i32_to_v1i64: -; SI: buffer_load_dword -; SI: v_ashrrev_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i32>, <1 x i32> addrspace(1)* %in - %ext = sext <1 x i32> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i32_to_v2i64: -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i32>, <2 x i32> addrspace(1)* %in - %ext = zext <2 x i32> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i32_to_v2i64: -; SI: buffer_load_dwordx2 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i32>, <2 x i32> addrspace(1)* %in - %ext = sext <2 x i32> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i32_to_v4i64: -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i32>, <4 x i32> addrspace(1)* %in - %ext = zext <4 x i32> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i32_to_v4i64: -; SI: buffer_load_dwordx4 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i32>, <4 x i32> addrspace(1)* %in - %ext = sext <4 x i32> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i32>, <8 x i32> addrspace(1)* %in - %ext = zext <8 x i32> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; 
SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i32>, <8 x i32> addrspace(1)* %in - %ext = sext <8 x i32> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI: s_endpgm -define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i32>, <16 x i32> addrspace(1)* %in - %ext = sext <16 x i32> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 - -; SI: s_endpgm -define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i32>, <16 x i32> addrspace(1)* %in - %ext = zext <16 x i32> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out 
- ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 -; SI-DAG: v_ashrrev_i32 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i32>, <32 x i32> addrspace(1)* %in - %ext = sext <32 x i32> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dwordx2 - -; SI: s_endpgm -define void @zextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* nocapture %in) nounwind { - %load = load <32 x i32>, <32 x i32> addrspace(1)* %in - %ext = zext <32 x i32> %load to <32 x i64> - store <32 x i64> %ext, <32 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global-extload-i8.ll b/test/CodeGen/R600/global-extload-i8.ll deleted file mode 100644 index b31d5361d5a..00000000000 --- a/test/CodeGen/R600/global-extload-i8.ll +++ /dev/null @@ -1,299 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}zextload_global_i8_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @zextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = zext i8 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i8_to_i32: -; SI: buffer_load_sbyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @sextload_global_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = sext i8 %a to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i32: -; SI: s_endpgm -define void @zextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = zext <1 x i8> %load to <1 x i32> - store <1 x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i32: -; SI: s_endpgm -define void @sextload_global_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = sext <1 x i8> %load to <1 x i32> - store <1 
x i32> %ext, <1 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i32: -; SI: s_endpgm -define void @zextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = zext <2 x i8> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i32: -; SI: s_endpgm -define void @sextload_global_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = sext <2 x i8> %load to <2 x i32> - store <2 x i32> %ext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i32: -; SI: s_endpgm -define void @zextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = zext <4 x i8> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i32: -; SI: s_endpgm -define void @sextload_global_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = sext <4 x i8> %load to <4 x i32> - store <4 x i32> %ext, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i32: -; SI: s_endpgm -define void @zextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = zext <8 x i8> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i32: -; SI: s_endpgm -define void @sextload_global_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = sext <8 x i8> %load to <8 x i32> - store <8 x i32> %ext, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i32: -; SI: s_endpgm -define void @zextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = zext <16 x i8> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i32: -; SI: s_endpgm -define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = sext <16 x i8> %load to <16 x i32> - store <16 x i32> %ext, <16 x i32> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i32: -; XSI: s_endpgm -; define void @zextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = zext <32 x i8> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i32: -; XSI: s_endpgm -; define void @sextload_global_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = sext <32 
x i8> %load to <32 x i32> -; store <32 x i32> %ext, <32 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i32: -; XSI: s_endpgm -; define void @zextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = zext <64 x i8> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i32: -; XSI: s_endpgm -; define void @sextload_global_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = sext <64 x i8> %load to <64 x i32> -; store <64 x i32> %ext, <64 x i32> addrspace(1)* %out -; ret void -; } - -; FUNC-LABEL: {{^}}zextload_global_i8_to_i64: -; SI: buffer_load_ubyte v[[LO:[0-9]+]], -; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = zext i8 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_i8_to_i64: -; SI: buffer_load_sbyte [[LOAD:v[0-9]+]], -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] -; SI: buffer_store_dwordx2 -define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %a = load i8, i8 addrspace(1)* %in - %ext = sext i8 %a to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v1i8_to_v1i64: -; SI: s_endpgm -define void @zextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = zext <1 x i8> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v1i8_to_v1i64: -; SI: s_endpgm -define void @sextload_global_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <1 x i8>, <1 x i8> addrspace(1)* %in - %ext = sext <1 x i8> %load to <1 x i64> - store <1 x i64> %ext, <1 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v2i8_to_v2i64: -; SI: s_endpgm -define void @zextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = zext <2 x i8> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v2i8_to_v2i64: -; SI: s_endpgm -define void @sextload_global_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <2 x i8>, <2 x i8> addrspace(1)* %in - %ext = sext <2 x i8> %load to <2 x i64> - store <2 x i64> %ext, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v4i8_to_v4i64: -; SI: s_endpgm -define void @zextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = zext <4 x i8> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v4i8_to_v4i64: -; SI: s_endpgm -define void @sextload_global_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> 
addrspace(1)* nocapture %in) nounwind { - %load = load <4 x i8>, <4 x i8> addrspace(1)* %in - %ext = sext <4 x i8> %load to <4 x i64> - store <4 x i64> %ext, <4 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v8i8_to_v8i64: -; SI: s_endpgm -define void @zextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = zext <8 x i8> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v8i8_to_v8i64: -; SI: s_endpgm -define void @sextload_global_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <8 x i8>, <8 x i8> addrspace(1)* %in - %ext = sext <8 x i8> %load to <8 x i64> - store <8 x i64> %ext, <8 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_global_v16i8_to_v16i64: -; SI: s_endpgm -define void @zextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = zext <16 x i8> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_global_v16i8_to_v16i64: -; SI: s_endpgm -define void @sextload_global_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* nocapture %in) nounwind { - %load = load <16 x i8>, <16 x i8> addrspace(1)* %in - %ext = sext <16 x i8> %load to <16 x i64> - store <16 x i64> %ext, <16 x i64> addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}zextload_global_v32i8_to_v32i64: -; XSI: s_endpgm -; define void @zextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = zext <32 x i8> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v32i8_to_v32i64: -; XSI: s_endpgm -; define void @sextload_global_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <32 x i8>, <32 x i8> addrspace(1)* %in -; %ext = sext <32 x i8> %load to <32 x i64> -; store <32 x i64> %ext, <32 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}zextload_global_v64i8_to_v64i64: -; XSI: s_endpgm -; define void @zextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = zext <64 x i8> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } - -; XFUNC-LABEL: {{^}}sextload_global_v64i8_to_v64i64: -; XSI: s_endpgm -; define void @sextload_global_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* nocapture %in) nounwind { -; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in -; %ext = sext <64 x i8> %load to <64 x i64> -; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out -; ret void -; } diff --git a/test/CodeGen/R600/global-zero-initializer.ll b/test/CodeGen/R600/global-zero-initializer.ll deleted file mode 100644 index 45aa8bf4e1d..00000000000 --- a/test/CodeGen/R600/global-zero-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer 
for address space in load_init_global_global - -@lds = addrspace(1) global [256 x i32] zeroinitializer - -define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(1)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(1)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/global_atomics.ll b/test/CodeGen/R600/global_atomics.ll deleted file mode 100644 index 847950f6376..00000000000 --- a/test/CodeGen/R600/global_atomics.ll +++ /dev/null @@ -1,801 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}atomic_add_i32_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: -; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: -; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32: -; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret: -; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_addr64: -; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in 
seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64: -; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: -; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: -; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32: -; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret: -; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_addr64: -; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64: -; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: -; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32: -; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: -; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64: -; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* 
%out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: -; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32: -; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_addr64: -; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64: -; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile max i32 addrspace(1)* %ptr, 
i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: -; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32: -; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: -; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64: -; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_offset: -; SI: 
buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: -; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32: -; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_addr64: -; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64: -; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { 
-entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: -; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32: -; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: -; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64: -; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - 
ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: -; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: -; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32: -; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret: -; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_addr64: -; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64: -; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] 
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: -; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32: -; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: -; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64: -; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { -entry: - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc {{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %gep = getelementptr i32, 
i32 addrspace(1)* %out, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: -; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 - %0 = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32: -; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { -entry: - %0 = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: -; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst - ret void -} - -; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64: -; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; SI: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { -entry: - %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index - %0 = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst - store i32 %0, i32 addrspace(1)* %out2 - ret void -} diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll deleted file mode 100644 index 014b0a5482a..00000000000 --- a/test/CodeGen/R600/gv-const-addrspace-fail.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -@a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1 - -; FUNC-LABEL: {{^}}test_i8: -; EG: CF_END -; SI: buffer_store_byte -; SI: s_endpgm -define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 { - %arrayidx = getelementptr inbounds [1 x i8], [1 x i8] addrspace(2)* @a, i32 0, i32 %s - %1 = load i8, i8 addrspace(2)* %arrayidx, align 1 - store i8 %1, 
i8 addrspace(1)* %out - ret void -} - -@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 - -; FUNC-LABEL: {{^}}test_i16: -; EG: CF_END -; SI: buffer_store_short -; SI: s_endpgm -define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 { - %arrayidx = getelementptr inbounds [1 x i16], [1 x i16] addrspace(2)* @b, i32 0, i32 %s - %1 = load i16, i16 addrspace(2)* %arrayidx, align 2 - store i16 %1, i16 addrspace(1)* %out - ret void -} - -%struct.bar = type { float, [5 x i8] } - -; The illegal i8s aren't handled -@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ] - -; FUNC-LABEL: {{^}}struct_bar_gv_load: -define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [1 x %struct.bar], [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index - %load = load i8, i8 addrspace(2)* %gep, align 1 - store i8 %load, i8 addrspace(1)* %out, align 1 - ret void -} - - -; The private load isn't scalarzied. -@array_vector_gv = internal addrspace(2) constant [4 x <4 x i32>] [ <4 x i32> , - <4 x i32> , - <4 x i32> , - <4 x i32> ] - -; FUNC-LABEL: {{^}}array_vector_gv_load: -define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index - %load = load <4 x i32>, <4 x i32> addrspace(2)* %gep, align 16 - store <4 x i32> %load, <4 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll deleted file mode 100644 index 3c1fc6c98f7..00000000000 --- a/test/CodeGen/R600/gv-const-addrspace.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -@b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2 - -@float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4 - -; FUNC-LABEL: {{^}}float: -; FIXME: We should be using s_load_dword here. -; SI: buffer_load_dword -; VI: s_load_dword - -; EG-DAG: MOV {{\** *}}T2.X -; EG-DAG: MOV {{\** *}}T3.X -; EG-DAG: MOV {{\** *}}T4.X -; EG-DAG: MOV {{\** *}}T5.X -; EG-DAG: MOV {{\** *}}T6.X -; EG: MOVA_INT - -define void @float(float addrspace(1)* %out, i32 %index) { -entry: - %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index - %1 = load float, float addrspace(2)* %0 - store float %1, float addrspace(1)* %out - ret void -} - -@i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4 - -; FUNC-LABEL: {{^}}i32: - -; FIXME: We should be using s_load_dword here. 
-; SI: buffer_load_dword -; VI: s_load_dword - -; EG-DAG: MOV {{\** *}}T2.X -; EG-DAG: MOV {{\** *}}T3.X -; EG-DAG: MOV {{\** *}}T4.X -; EG-DAG: MOV {{\** *}}T5.X -; EG-DAG: MOV {{\** *}}T6.X -; EG: MOVA_INT - -define void @i32(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - - -%struct.foo = type { float, [5 x i32] } - -@struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ] - -; FUNC-LABEL: {{^}}struct_foo_gv_load: -; GCN: s_load_dword - -define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index - %load = load i32, i32 addrspace(2)* %gep, align 4 - store i32 %load, i32 addrspace(1)* %out, align 4 - ret void -} - -@array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> , - <1 x i32> , - <1 x i32> , - <1 x i32> ] - -; FUNC-LABEL: {{^}}array_v1_gv_load: -; FIXME: We should be using s_load_dword here. -; SI: buffer_load_dword -; VI: s_load_dword -define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { - %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index - %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 - store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 - ret void -} - -define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { -entry: - %0 = icmp eq i32 0, %a - br i1 %0, label %if, label %else - -if: - %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index - %2 = load float, float addrspace(2)* %1 - store float %2, float addrspace(1)* %out - br label %endif - -else: - store float 1.0, float addrspace(1)* %out - br label %endif - -endif: - ret void -} diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll deleted file mode 100644 index bf8f11860b5..00000000000 --- a/test/CodeGen/R600/half.ll +++ /dev/null @@ -1,525 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -; half args should be promoted to float - -; GCN-LABEL: {{^}}load_f16_arg: -; GCN: s_load_dword [[ARG:s[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] -; GCN: buffer_store_short [[CVT]] -define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { - store half %arg, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v2f16_arg: -; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN: s_endpgm -define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { - store <2 x half> %arg, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v3f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_short -; 
GCN-NOT: buffer_store -; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { - store <3 x half> %arg, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { - store <4 x half> %arg, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { - store <8 x half> %arg, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_arg: -define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { - %fpext = fpext <2 x half> %in to <2 x float> - store <2 x float> %fpext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to float - store float %ext, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x float> - store <2 x float> %ext, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN-NOT: buffer_load -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 -; GCN-NOT: v_cvt_f32_f16 -; GCN-DAG: buffer_store_dword -; GCN-DAG: buffer_store_dwordx2 -; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x float> - store <3 x float> %ext, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x float> - store <4 x float> %ext, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x float> - store <8 x float> %ext, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_f16_to_f64_arg: -define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { - %ext = fpext half %arg to double - store double %ext, double addrspace(1)* %out - ret void -} -; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { - %ext = fpext <2 x half> %arg to <2 x double> - store <2 x double> %ext, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { - %ext = fpext <3 x half> %arg to <3 x double> - store <3 x double> %ext, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { - %ext = fpext <4 x half> %arg to <4 x double> - store <4 x 
double> %ext, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { - %ext = fpext <8 x half> %arg to <8 x double> - store <8 x double> %ext, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_f16: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - store half %val, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v2f16: -; GCN: buffer_load_dword [[TMP:v[0-9]+]] -; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - store <2 x half> %val, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v4f16: -; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - store <4 x half> %val, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_load_store_v8f16: -; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] -; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - store <8 x half> %val, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f32: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to float - store float %cvt, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x float> - store <2 x float> %cvt, <2 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x float> - store <3 x float> %cvt, <3 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x float> - store <4 x float> %cvt, <4 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x float> - store <8 x float> %cvt, <8 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 
{{^}}global_extload_v16f16_to_v16f32: -define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x float> - store <16 x float> %cvt, <16 x float> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_f16_to_f64: -; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] -; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] -; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] -; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { - %val = load half, half addrspace(1)* %in - %cvt = fpext half %val to double - store double %cvt, double addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { - %val = load <2 x half>, <2 x half> addrspace(1)* %in - %cvt = fpext <2 x half> %val to <2 x double> - store <2 x double> %cvt, <2 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { - %val = load <3 x half>, <3 x half> addrspace(1)* %in - %cvt = fpext <3 x half> %val to <3 x double> - store <3 x double> %cvt, <3 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %val = load <4 x half>, <4 x half> addrspace(1)* %in - %cvt = fpext <4 x half> %val to <4 x double> - store <4 x double> %cvt, <4 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { - %val = load <8 x half>, <8 x half> addrspace(1)* %in - %cvt = fpext <8 x half> %val to <8 x double> - store <8 x double> %cvt, <8 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { - %val = load <16 x half>, <16 x half> addrspace(1)* %in - %cvt = fpext <16 x half> %val to <16 x double> - store <16 x double> %cvt, <16 x double> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] -; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { - %val = load float, float addrspace(1)* %in - %cvt = fptrunc float %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: -; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] -; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: buffer_store_short [[CVT0]] -; GCN-DAG: buffer_store_short [[CVT1]] -; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { - %val = load <2 x float>, <2 x float> addrspace(1)* %in - %cvt = fptrunc <2 x float> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void 
-} - -; FIXME: Shouldn't do 4th conversion -; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_dword -; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { - %val = load <3 x float>, <3 x float> addrspace(1)* %in - %cvt = fptrunc <3 x float> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: -; GCN: buffer_load_dwordx4 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { - %val = load <4 x float>, <4 x float> addrspace(1)* %in - %cvt = fptrunc <4 x float> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { - %val = load <8 x float>, <8 x float> addrspace(1)* %in - %cvt = fptrunc <8 x float> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void 
@global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { - %val = load <16 x float>, <16 x float> addrspace(1)* %in - %cvt = fptrunc <16 x float> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} - -; FIXME: Unsafe math should fold conversions away -; GCN-LABEL: {{^}}fadd_f16: -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { - %add = fadd half %a, %b - store half %add, half addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}fadd_v2f16: -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { - %add = fadd <2 x half> %a, %b - store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}fadd_v4f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { - %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 - %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 - %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 - %result = fadd <4 x half> %a, %b - store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 - ret void -} - -; GCN-LABEL: {{^}}fadd_v8f16: -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; SI: v_add_f32 -; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { - %add = fadd <8 x half> %a, %b - store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 - ret void -} - -; GCN-LABEL: {{^}}fsub_f16: -; GCN: v_subrev_f32_e32 -; GCN: s_endpgm -define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { - %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 - %a = load half, half addrspace(1)* %in - %b = load half, half addrspace(1)* %b_ptr - %sub = fsub half %a, %b - store half %sub, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_from_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { - %val = load half, half addrspace(1)* %in - %val_int = bitcast half %val to i16 - store i16 %val_int, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}test_bitcast_to_half: -; GCN: buffer_load_ushort [[TMP:v[0-9]+]] -; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { - %val = load i16, i16 addrspace(1)* %in - %val_fp = bitcast i16 %val to half - store half %val_fp, half addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/hsa.ll b/test/CodeGen/R600/hsa.ll deleted file mode 100644 index f9113399afe..00000000000 --- a/test/CodeGen/R600/hsa.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s - -; HSA: .section .hsa.version -; HSA-NEXT: .ascii "HSA Code Unit:0.0:AMD:0.1:GFX8.1:0" -; HSA: {{^}}simple: -; Make sure we are setting the ATC bit: -; HSA: s_mov_b32 s[[HI:[0-9]]], 0x100f000 -; HSA: buffer_store_dword v{{[0-9]+}}, 
s[0:[[HI]]], 0 - -define void @simple(i32 addrspace(1)* %out) { -entry: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/i1-copy-implicit-def.ll b/test/CodeGen/R600/i1-copy-implicit-def.ll deleted file mode 100644 index b11a2113764..00000000000 --- a/test/CodeGen/R600/i1-copy-implicit-def.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SILowerI1Copies was not handling IMPLICIT_DEF -; SI-LABEL: {{^}}br_implicit_def: -; SI: BB#0: -; SI-NEXT: s_and_saveexec_b64 -; SI-NEXT: s_xor_b64 -; SI-NEXT: BB#1: -define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { -bb: - br i1 undef, label %bb1, label %bb2 - -bb1: - store volatile i32 123, i32 addrspace(1)* %out - ret void - -bb2: - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/i1-copy-phi.ll b/test/CodeGen/R600/i1-copy-phi.ll deleted file mode 100644 index 105cd06b330..00000000000 --- a/test/CodeGen/R600/i1-copy-phi.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}br_i1_phi: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; SI: s_and_saveexec_b64 -; SI: s_xor_b64 -; SI: v_mov_b32_e32 [[REG]], -1{{$}} -; SI: v_cmp_ne_i32_e32 vcc, 0, [[REG]] -; SI: s_and_saveexec_b64 -; SI: s_xor_b64 -; SI: s_endpgm -define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { -bb: - br i1 %arg1, label %bb2, label %bb3 - -bb2: ; preds = %bb - br label %bb3 - -bb3: ; preds = %bb2, %bb - %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] - br i1 %tmp, label %bb4, label %bb6 - -bb4: ; preds = %bb3 - %tmp5 = mul i32 undef, %arg - br label %bb6 - -bb6: ; preds = %bb4, %bb3 - ret void -} diff --git a/test/CodeGen/R600/i8-to-double-to-float.ll b/test/CodeGen/R600/i8-to-double-to-float.ll deleted file mode 100644 index c218e1918bb..00000000000 --- a/test/CodeGen/R600/i8-to-double-to-float.ll +++ /dev/null @@ -1,11 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { - %1 = load i8, i8 addrspace(1)* %in - %2 = uitofp i8 %1 to double - %3 = fptrunc double %2 to float - store float %3, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll deleted file mode 100644 index 60e59a5a528..00000000000 --- a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll +++ /dev/null @@ -1,18 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;Test that a select with reversed True/False values is correctly lowered -;to a SETNE_INT. There should only be one SETNE_INT instruction. 
- -;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK-NOT: SETNE_INT - -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx1 - %cmp = icmp eq i32 %0, %1 - %value = select i1 %cmp, i32 0, i32 -1 - store i32 %value, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/icmp64.ll b/test/CodeGen/R600/icmp64.ll deleted file mode 100644 index 0eaa33ebafe..00000000000 --- a/test/CodeGen/R600/icmp64.ll +++ /dev/null @@ -1,93 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}test_i64_eq: -; SI: v_cmp_eq_i64 -define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp eq i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ne: -; SI: v_cmp_ne_i64 -define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ne i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_slt: -; SI: v_cmp_lt_i64 -define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp slt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ult: -; SI: v_cmp_lt_u64 -define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ult i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sle: -; SI: v_cmp_le_i64 -define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sle i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ule: -; SI: v_cmp_le_u64 -define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ule i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sgt: -; SI: v_cmp_gt_i64 -define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sgt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_ugt: -; SI: v_cmp_gt_u64 -define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp ugt i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_sge: -; SI: v_cmp_ge_i64 -define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp sge i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_i64_uge: -; SI: v_cmp_ge_u64 -define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %cmp = icmp uge i64 %a, %b - %result = sext i1 %cmp to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll deleted file mode 100644 index 
12eed550eb1..00000000000 --- a/test/CodeGen/R600/imm.ll +++ /dev/null @@ -1,617 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CHECK %s - -; Use a 64-bit value with lo bits that can be represented as an inline constant -; CHECK-LABEL: {{^}}i64_imm_inline_lo: -; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: -define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { -entry: - store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 - ret void -} - -; Use a 64-bit value with hi bits that can be represented as an inline constant -; CHECK-LABEL: {{^}}i64_imm_inline_hi: -; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5 -; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]] -; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] -define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { -entry: - store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 - ret void -} - -; CHECK-LABEL: {{^}}store_imm_neg_0.0_i64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { - store i64 -9223372036854775808, i64 addrspace(1) *%out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { - store i32 -2147483648, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { - store float 0.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_imm_neg_0.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000 -; CHECK: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { - store float -0.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { - store float 0.5, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { - store float -0.5, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { - store float 1.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { - store float -1.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32: -; CHECK: 
v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { - store float 2.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { - store float -2.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { - store float 4.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { - store float -4.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_literal_imm_f32: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 -; CHECK: buffer_store_dword [[REG]] -define void @store_literal_imm_f32(float addrspace(1)* %out) { - store float 4096.0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 1.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -1.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 2.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2.0_f32(float 
addrspace(1)* %out, float %x) { - %y = fadd float %x, -2.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 4.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, -4.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] -; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %x = load float, float addrspace(1)* %in - %y = fadd float %x, 0.5 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}commute_add_literal_f32: -; CHECK: buffer_load_dword [[VAL:v[0-9]+]] -; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { - %x = load float, float addrspace(1)* %in - %y = fadd float %x, 1024.0 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36a0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36b0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_16_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 16, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36e0000000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xffffffffe0000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xffffffffc0000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -16, [[VAL]] -; CHECK: buffer_store_dword 
[[REG]] -define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0xfffffffe00000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_63_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 63, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x36ff800000000000 - store float %y, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_64_f32: -; CHECK: s_load_dword [[VAL:s[0-9]+]] -; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 64, [[VAL]] -; CHECK: buffer_store_dword [[REG]] -define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { - %y = fadd float %x, 0x3700000000000000 - store float %y, float addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}add_inline_imm_0.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 0.5, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0.5 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -0.5, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -0.5 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 1.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -1.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void 
@add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 2.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -2.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 4.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 4.0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -4.0, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, -4.0 - store double %y, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}add_inline_imm_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000001 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_2_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 2, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000002 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 16, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000010 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_1_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -1, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xffffffffffffffff - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_2_f64: -; SI: 
s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -2, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xfffffffffffffffe - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_neg_16_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -16, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0xfffffffffffffff0 - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_63_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x000000000000003F - store double %y, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}add_inline_imm_64_f64: -; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c -; CHECK: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 64, [[VAL]] -; CHECK: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { - %y = fadd double %x, 0x0000000000000040 - store double %y, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64: -; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 -; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { - store double 0.0, double addrspace(1)* %out - ret void -} - - -; CHECK-LABEL: {{^}}store_literal_imm_neg_0.0_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x80000000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { - store double -0.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_0.5_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { - store double 0.5, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { - store double -0.5, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_1.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: 
v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { - store double 1.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { - store double -1.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_2.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { - store double 2.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { - store double -2.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_4.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { - store double 4.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f64: -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { - store double -4.0, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}store_literal_imm_f64: -; CHECK-DAG: s_mov_b32 s[[HI_SREG:[0-9]+]], 0x40b00000 -; CHECK-DAG: s_mov_b32 s[[LO_SREG:[0-9]+]], 0{{$}} -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_f64(double addrspace(1)* %out) { - store double 4096.0, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll deleted file mode 100644 index f551606d63a..00000000000 --- a/test/CodeGen/R600/indirect-addressing-si.ll +++ /dev/null @@ -1,121 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; Tests for indirect addressing on SI, which is implemented using dynamic -; indexing of vectors. 
- -; CHECK-LABEL: {{^}}extract_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 -define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 %in, 1 - %1 = extractelement <4 x float> , i32 %0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movrels_b32_e32 -define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = extractelement <4 x float> , i32 %in - store float %0, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movrels_b32_e32 v{{[0-9]}}, v0 -define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { -entry: - %index = add i32 %offset, -512 - %value = extractelement <4 x i32> , i32 %index - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}extract_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movrels_b32_e32 v{{[0-9]}}, v0 -; CHECK: s_cbranch_execnz -define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -512 - %value = extractelement <4 x i32> , i32 %index - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_w_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 -define void @insert_w_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 %in, 1 - %1 = insertelement <4 x float> , float 5.0, i32 %0 - %2 = extractelement <4 x float> %1, i32 2 - store float %2, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_wo_offset: -; CHECK: s_mov_b32 m0 -; CHECK-NEXT: v_movreld_b32_e32 -define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) { -entry: - %0 = insertelement <4 x float> , float 5.0, i32 %in - %1 = extractelement <4 x float> %0, i32 2 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_offset_sgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}} -; CHECK: v_movreld_b32_e32 v0, v{{[0-9]}} -define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { -entry: - %index = add i32 %offset, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. -; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, 0xfffffe{{[0-9a-z]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz -define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -512 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}insert_neg_inline_offset_vgpr: -; The offset depends on the register that holds the first element of the vector. 
-; CHECK: v_readfirstlane_b32 -; CHECK: s_add_i32 m0, m0, -{{[0-9]+}} -; CHECK-NEXT: v_movreld_b32_e32 v0, v{{[0-9]}} -; CHECK: s_cbranch_execnz -define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { -entry: - %id = call i32 @llvm.r600.read.tidig.x() #1 - %index = add i32 %id, -16 - %value = insertelement <4 x i32> , i32 5, i32 %index - store <4 x i32> %value, <4 x i32> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll deleted file mode 100644 index d63e1b6c521..00000000000 --- a/test/CodeGen/R600/indirect-private-64.ll +++ /dev/null @@ -1,91 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s - - -declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind - -; SI-LABEL: {{^}}private_access_f64_alloca: - -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 - -; SI-PROMOTE: ds_write_b64 -; SI-PROMOTE: ds_read_b64 -define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load double, double addrspace(1)* %in, align 8 - %array = alloca double, i32 16, align 8 - %ptr = getelementptr double, double* %array, i32 %b - store double %val, double* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load double, double* %ptr, align 8 - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}private_access_v2f64_alloca: - -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 - %array = alloca <2 x double>, i32 16, align 16 - %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b - store <2 x double> %val, <2 x double>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load <2 x double>, <2 x double>* %ptr, align 16 - store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}private_access_i64_alloca: - -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 - -; SI-PROMOTE: ds_write_b64 -; SI-PROMOTE: ds_read_b64 -define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load i64, i64 addrspace(1)* %in, align 8 - %array = alloca i64, i32 16, align 8 - %ptr = getelementptr i64, i64* %array, i32 %b - store i64 %val, i64* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - 
%result = load i64, i64* %ptr, align 8 - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}private_access_v2i64_alloca: - -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 -define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { - %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 - %array = alloca <2 x i64>, i32 16, align 16 - %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b - store <2 x i64> %val, <2 x i64>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind - %result = load <2 x i64>, <2 x i64>* %ptr, align 16 - store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/infinite-loop-evergreen.ll b/test/CodeGen/R600/infinite-loop-evergreen.ll deleted file mode 100644 index f6e39b3d830..00000000000 --- a/test/CodeGen/R600/infinite-loop-evergreen.ll +++ /dev/null @@ -1,10 +0,0 @@ -; XFAIL: * -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s - -define void @inf_loop_irreducible_cfg() nounwind { -entry: - br label %block - -block: - br label %block -} diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll deleted file mode 100644 index 7233aa57fd7..00000000000 --- a/test/CodeGen/R600/infinite-loop.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}infinite_loop: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7 -; SI: BB0_1: -; SI: buffer_store_dword [[REG]] -; SI: s_waitcnt vmcnt(0) expcnt(0) -; SI: s_branch BB0_1 -define void @infinite_loop(i32 addrspace(1)* %out) { -entry: - br label %for.body - -for.body: ; preds = %entry, %for.body - store i32 999, i32 addrspace(1)* %out, align 4 - br label %for.body -} - diff --git a/test/CodeGen/R600/inline-asm.ll b/test/CodeGen/R600/inline-asm.ll deleted file mode 100644 index efc2292de3a..00000000000 --- a/test/CodeGen/R600/inline-asm.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}inline_asm: -; CHECK: s_endpgm -; CHECK: s_endpgm -define void @inline_asm(i32 addrspace(1)* %out) { -entry: - store i32 5, i32 addrspace(1)* %out - call void asm sideeffect "s_endpgm", ""() - ret void -} diff --git a/test/CodeGen/R600/inline-calls.ll b/test/CodeGen/R600/inline-calls.ll deleted file mode 100644 index 33a4c832e75..00000000000 --- a/test/CodeGen/R600/inline-calls.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s - -; CHECK-NOT: {{^}}func: -define internal fastcc i32 @func(i32 %a) { -entry: - %tmp0 = add i32 %a, 1 - ret i32 %tmp0 -} - -; CHECK: {{^}}kernel: -define void @kernel(i32 addrspace(1)* %out) { -entry: - %tmp0 = call i32 @func(i32 1) - store i32 %tmp0, i32 addrspace(1)* %out - ret void -} - -; CHECK: 
{{^}}kernel2: -define void @kernel2(i32 addrspace(1)* %out) { -entry: - call void @kernel(i32 addrspace(1)* %out) - ret void -} diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll deleted file mode 100644 index 1c4d285cbcb..00000000000 --- a/test/CodeGen/R600/input-mods.ll +++ /dev/null @@ -1,26 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM - -;EG-LABEL: {{^}}test: -;EG: EXP_IEEE * -;CM-LABEL: {{^}}test: -;CM: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X| -;CM: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X| -;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| -;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = call float @llvm.fabs.f32(float %r0) - %r2 = fsub float -0.000000e+00, %r1 - %r3 = call float @llvm.exp2.f32(float %r2) - %vec = insertelement <4 x float> undef, float %r3, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.exp2.f32(float) readnone -declare float @llvm.fabs.f32(float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/insert_subreg.ll b/test/CodeGen/R600/insert_subreg.ll deleted file mode 100644 index 4a5e8869c2d..00000000000 --- a/test/CodeGen/R600/insert_subreg.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s - -; Test that INSERT_SUBREG instructions don't have non-register operands after -; instruction selection. - -; Make sure this doesn't crash -; CHECK-LABEL: test: -define void @test(i64 addrspace(1)* %out) { -entry: - %tmp0 = alloca [16 x i32] - %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 - %tmp2 = sext i32 %tmp1 to i64 - store i64 %tmp2, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll deleted file mode 100644 index 6de3d408c48..00000000000 --- a/test/CodeGen/R600/insert_vector_elt.ll +++ /dev/null @@ -1,252 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; FIXME: Broken on evergreen -; FIXME: For some reason the 8 and 16 vectors are being stored as -; individual elements instead of 128-bit stores. - - -; FIXME: Why is the constant moved into the intermediate register and -; not just directly into the vector component? 
- -; SI-LABEL: {{^}}insertelement_v4f32_0: -; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]: -; v_mov_b32_e32 -; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00 -; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] -; buffer_store_dwordx4 v{{[}}[[LOW_REG]]: -define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_1: -define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_2: -define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4f32_3: -define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}insertelement_v4i32_0: -define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { - %vecins = insertelement <4 x i32> %a, i32 999, i32 0 - store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2f32: -; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { - %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b - store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4f32: -; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 -; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] -; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { - %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b - store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { - %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b - store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16f32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { - %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b - store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2i32: -; SI: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i32(<2 x i32> 
addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i32> %a, i32 5, i32 %b - store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i32: -; SI: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i32> %a, i32 5, i32 %b - store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8i32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <8 x i32> %a, i32 5, i32 %b - store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16i32: -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { - %vecins = insertelement <16 x i32> %a, i32 5, i32 %b - store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 - ret void -} - - -; SI-LABEL: {{^}}dynamic_insertelement_v2i16: -; FIXMESI: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i16> %a, i16 5, i32 %b - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i16: -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i16> %a, i16 5, i32 %b - store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16 - ret void -} - - -; SI-LABEL: {{^}}dynamic_insertelement_v2i8: -; FIXMESI: BUFFER_STORE_USHORT -define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i8> %a, i8 5, i32 %b - store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4i8: -; FIXMESI: buffer_store_dword -define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <4 x i8> %a, i8 5, i32 %b - store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8i8: -; FIXMESI: buffer_store_dwordx2 -define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <8 x i8> %a, i8 5, i32 %b - store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v16i8: -; FIXMESI: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { - %vecins = insertelement <16 x i8> %a, i8 5, i32 %b - store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 - ret void -} - -; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that -; the compiler doesn't crash. 
-; SI-LABEL: {{^}}insert_split_bb: -define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { -entry: - %0 = insertelement <2 x i32> undef, i32 %a, i32 0 - %1 = icmp eq i32 %a, 0 - br i1 %1, label %if, label %else - -if: - %2 = load i32, i32 addrspace(1)* %in - %3 = insertelement <2 x i32> %0, i32 %2, i32 1 - br label %endif - -else: - %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %5 = load i32, i32 addrspace(1)* %4 - %6 = insertelement <2 x i32> %0, i32 %5, i32 1 - br label %endif - -endif: - %7 = phi <2 x i32> [%3, %if], [%6, %else] - store <2 x i32> %7, <2 x i32> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { - %vecins = insertelement <2 x double> %a, double 8.0, i32 %b - store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v2i64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { - %vecins = insertelement <2 x i64> %a, i64 5, i32 %b - store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v4f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { - %vecins = insertelement <4 x double> %a, double 8.0, i32 %b - store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}dynamic_insertelement_v8f64: -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { - %vecins = insertelement <8 x double> %a, double 8.0, i32 %b - store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll deleted file mode 100644 index f55912e3740..00000000000 --- a/test/CodeGen/R600/jump-address.ll +++ /dev/null @@ -1,52 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: JUMP @6 -; CHECK: EXPORT -; CHECK-NOT: EXPORT - -define void @main() #0 { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - %4 = sext i1 %3 to i32 - %5 = bitcast i32 %4 to float - %6 = bitcast float %5 to i32 - %7 = icmp ne i32 %6, 0 - br i1 %7, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %9 = extractelement <4 x float> %8, i32 0 - %10 = bitcast float %9 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float 
%13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF13, label %ENDIF - -ENDIF: ; preds = %IF13, %ELSE, %main_body - %temp.0 = phi float [ 0xFFF8000000000000, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %temp3.0 = phi float [ 5.000000e-01, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %16 = insertelement <4 x float> undef, float %temp.0, i32 0 - %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 - %18 = insertelement <4 x float> %17, float %temp2.0, i32 2 - %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) - ret void - -IF13: ; preds = %ELSE - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fsub float -0.000000e+00, %21 - %23 = fadd float 0xFFF8000000000000, %22 - br label %ENDIF -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll deleted file mode 100644 index 7e2291cfdc3..00000000000 --- a/test/CodeGen/R600/kcache-fold.ll +++ /dev/null @@ -1,100 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main1: -; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} -define void @main1() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = extractelement <4 x float> %10, i32 1 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 2 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, 
<4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) - ret void -} - -; CHECK: {{^}}main2: -; CHECK-NOT: MOV -define void @main2() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* null - %1 = extractelement <4 x float> %0, i32 0 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %3 = extractelement <4 x float> %2, i32 0 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 1 - %6 = fcmp ogt float %1, 0.000000e+00 - %7 = select i1 %6, float %3, float %5 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 1 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %11 = extractelement <4 x float> %10, i32 0 - %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %13 = extractelement <4 x float> %12, i32 1 - %14 = fcmp ogt float %9, 0.000000e+00 - %15 = select i1 %14, float %11, float %13 - %16 = load <4 x float>, <4 x float> addrspace(8)* null - %17 = extractelement <4 x float> %16, i32 2 - %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %19 = extractelement <4 x float> %18, i32 3 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %21 = extractelement <4 x float> %20, i32 2 - %22 = fcmp ogt float %17, 0.000000e+00 - %23 = select i1 %22, float %19, float %21 - %24 = load <4 x float>, <4 x float> addrspace(8)* null - %25 = extractelement <4 x float> %24, i32 3 - %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %27 = extractelement <4 x float> %26, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %29 = extractelement <4 x float> %28, i32 2 - %30 = fcmp ogt float %25, 0.000000e+00 - %31 = select i1 %30, float %27, float %29 - %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) - %35 = call float @llvm.AMDIL.clamp.(float 
%31, float 0.000000e+00, float 1.000000e+00) - %36 = insertelement <4 x float> undef, float %32, i32 0 - %37 = insertelement <4 x float> %36, float %33, i32 1 - %38 = insertelement <4 x float> %37, float %34, i32 2 - %39 = insertelement <4 x float> %38, float %35, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDIL.clamp.(float, float, float) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll deleted file mode 100644 index 1dd7c2cb799..00000000000 --- a/test/CodeGen/R600/kernel-args.ll +++ /dev/null @@ -1,473 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC - -; FUNC-LABEL: {{^}}i8_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ubyte - -define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { -entry: - %0 = zext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i8_zext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { -entry: - %0 = zext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i8_sext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { -entry: - %0 = sext i8 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; GCN: buffer_load_ushort - -define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { -entry: - %0 = zext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_zext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { -entry: - %0 = zext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i16_sext_arg: -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c - -define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { -entry: - %0 = sext i16 %in to i32 - store i32 %0, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i32_arg: -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { -entry: - store i32 %in, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}f32_arg: -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z -; SI: s_load_dword 
s{{[0-9]}}, s[0:1], 0xb -; VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { -entry: - store float %in, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v2i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { -entry: - store <2 x i8> %in, <2 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v2i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN-DAG: buffer_load_ushort -; GCN-DAG: buffer_load_ushort -define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { -entry: - store <2 x i16> %in, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v2i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W -; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb -; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { -entry: - store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v2f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W -; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb -; VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c -define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { -entry: - store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3i8_arg: -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 -; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { -entry: - store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3i16_arg: -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 -; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { -entry: - store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 - ret void -} -; FUNC-LABEL: {{^}}v3i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { -entry: - store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v3f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 -define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { -entry: - store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v4i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { -entry: - store <4 x i8> %in, <4 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v4i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 
-; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { -entry: - store <4 x i16> %in, <4 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v4i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { -entry: - store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v4f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X -; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd -; VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 -define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { -entry: - store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v8i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { -entry: - store <8 x i8> %in, <8 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v8i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { -entry: - store <8 x i16> %in, <8 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v8i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44 -define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { -entry: - store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v8f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X -; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11 -define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { -entry: - store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}v16i8_arg: -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { -entry: - store <16 x i8> %in, <16 x i8> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v16i16_arg: -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { -entry: - store <16 x i16> %in, <16 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v16i32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 -define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { -entry: - store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v16f32_arg: -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X -; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19 -; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64 
-define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { -entry: - store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}kernel_arg_i64: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: buffer_store_dwordx2 -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { - store i64 %a, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}f64_kernel_arg: -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9 -; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb -; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24 -; VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c -; GCN: buffer_store_dwordx2 -define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { -entry: - store double %in, double addrspace(1)* %out - ret void -} - -; XFUNC-LABEL: {{^}}kernel_arg_v1i64: -; XGCN: s_load_dwordx2 -; XGCN: s_load_dwordx2 -; XGCN: buffer_store_dwordx2 -; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { -; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 -; ret void -; } diff --git a/test/CodeGen/R600/large-alloca.ll b/test/CodeGen/R600/large-alloca.ll deleted file mode 100644 index 671833d1a33..00000000000 --- a/test/CodeGen/R600/large-alloca.ll +++ /dev/null @@ -1,15 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s - -define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { - %large = alloca [8192 x i32], align 4 - %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 - store i32 %x, i32* %gep - %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y - %0 = load i32, i32* %gep1 - store i32 %0, i32 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll deleted file mode 100644 index 9975b1b7f5c..00000000000 --- a/test/CodeGen/R600/large-constant-initializer.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s -; CHECK: s_endpgm - -@gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 - -define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 - %mul12 = mul nsw i32 %val, 7 - br i1 undef, label %exit, label %bb - -bb: - %cmp = icmp slt i32 %x, 0 - br label %exit - -exit: - ret void -} - diff --git a/test/CodeGen/R600/lds-initializer.ll b/test/CodeGen/R600/lds-initializer.ll deleted file mode 100644 index bf8df63be9f..00000000000 --- a/test/CodeGen/R600/lds-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_init_lds_global - -@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] - -define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(3)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/lds-oqap-crash.ll b/test/CodeGen/R600/lds-oqap-crash.ll deleted file mode 100644 index 6ff6fc3d7af..00000000000 
--- a/test/CodeGen/R600/lds-oqap-crash.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
-
-; The test is for a bug in R600EmitClauseMarkers.cpp where this pass
-; searched for a use of the OQAP register in order to determine
-; if an LDS instruction could fit in the current clause, but never found
-; one. This created an infinite loop and hung the compiler.
-;
-; The LDS instruction should not have been defining OQAP in the first place,
-; because the LDS instructions are pseudo instructions and the OQAP
-; reads and writes are bundled together in the same instruction.
-
-; CHECK: {{^}}lds_crash:
-define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
-entry:
-  %0 = load i32, i32 addrspace(3)* %in
-  ; This block needs to be > 115 ISA instructions to hit the bug,
-  ; so we'll use udiv instructions.
-  %div0 = udiv i32 %0, %b
-  %div1 = udiv i32 %div0, %a
-  %div2 = udiv i32 %div1, 11
-  %div3 = udiv i32 %div2, %a
-  %div4 = udiv i32 %div3, %b
-  %div5 = udiv i32 %div4, %c
-  %div6 = udiv i32 %div5, %div0
-  %div7 = udiv i32 %div6, %div1
-  store i32 %div7, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
deleted file mode 100644
index 44ffc36af14..00000000000
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ /dev/null
@@ -1,99 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
-;
-; This test checks that the lds input queue will be empty at the end of
-; the ALU clause.

-; CHECK-LABEL: {{^}}lds_input_queue:
-; CHECK: LDS_READ_RET * OQAP
-; CHECK-NOT: ALU clause
-; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-
-@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
-
-define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
-entry:
-  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-  %1 = load i32, i32 addrspace(3)* %0
-  call void @llvm.AMDGPU.barrier.local()
-
-  ; This will start a new clause for the vertex fetch
-  %2 = load i32, i32 addrspace(1)* %in
-  %3 = add i32 %1, %2
-  store i32 %3, i32 addrspace(1)* %out
-  ret void
-}
-
-declare void @llvm.AMDGPU.barrier.local()
-
-; The machine scheduler does not do proper alias analysis and assumes that
-; loads from global values (Note that a global value is different from a
-; value from global memory. A global value is a value that is declared
-; outside of a function; it can reside in any address space) alias with
-; all other loads.
-;
-; This is a problem for scheduling the reads from the local data share (lds).
-; These reads are implemented using two instructions. The first copies the
-; data from lds into the lds output queue, and the second moves the data from
-; the input queue into main memory. These two instructions don't have to be
-; scheduled one after the other, but they do need to be scheduled in the same
-; clause. The aliasing problem mentioned above causes problems when there is a
-; load from global memory which immediately follows a load from a global value that
-; has been declared in the local memory space:
-;
-; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
-; %1 = load i32, i32 addrspace(3)* %0
-; %2 = load i32, i32 addrspace(1)* %in
-;
-; The instruction selection phase will generate ISA that looks like this:
-; %OQAP = LDS_READ_RET
-; %vreg0 = MOV %OQAP
-; %vreg1 = VTX_READ_32
-; %vreg2 = ADD_INT %vreg1, %vreg0
-;
-; The bottom scheduler will schedule the two ALU instructions first:
-;
-; UNSCHEDULED:
-; %OQAP = LDS_READ_RET
-; %vreg1 = VTX_READ_32
-;
-; SCHEDULED:
-;
-; vreg0 = MOV %OQAP
-; vreg2 = ADD_INT %vreg1, %vreg2
-;
-; The lack of proper aliasing causes the local memory read (LDS_READ_RET)
-; to treat the global memory read (VTX_READ_32) as a chain dependency, so
-; the global memory read will always be scheduled first. This will give us a
-; final program which looks like this:
-;
-; Alu clause:
-; %OQAP = LDS_READ_RET
-; VTX clause:
-; %vreg1 = VTX_READ_32
-; Alu clause:
-; vreg0 = MOV %OQAP
-; vreg2 = ADD_INT %vreg1, %vreg2
-;
-; This is an illegal program because the OQAP def and use now occur in
-; different ALU clauses.
-;
-; This test checks this scenario and makes sure it doesn't result in an
-; illegal program. For now, we have fixed this issue by merging the
-; LDS_READ_RET and MOV together during instruction selection and then
-; expanding them after scheduling. Once the scheduler has better alias
-; analysis, we should be able to keep these instructions separate before
-; scheduling.
-;
-; CHECK-LABEL: {{^}}local_global_alias:
-; CHECK: LDS_READ_RET
-; CHECK-NOT: ALU clause
-; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
-entry:
-  %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
-  %1 = load i32, i32 addrspace(3)* %0
-  %2 = load i32, i32 addrspace(1)* %in
-  %3 = add i32 %2, %1
-  store i32 %3, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
deleted file mode 100644
index 3e8328659fd..00000000000
--- a/test/CodeGen/R600/lds-size.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; This test makes sure we do not double count global values when they are
-; used in different basic blocks.
- -; CHECK: .long 166120 -; CHECK-NEXT: .long 1 -; CHECK-LABEL: {{^}}test: -@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 - -define void @test(i32 addrspace(1)* %out, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %if, label %else - -if: - store i32 1, i32 addrspace(3)* @lds - br label %endif - -else: - store i32 2, i32 addrspace(3)* @lds - br label %endif - -endif: - ret void -} diff --git a/test/CodeGen/R600/lds-zero-initializer.ll b/test/CodeGen/R600/lds-zero-initializer.ll deleted file mode 100644 index fb51bc0e50c..00000000000 --- a/test/CodeGen/R600/lds-zero-initializer.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=SI < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s - -; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global - -@lds = addrspace(3) global [256 x i32] zeroinitializer - -define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { - %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 - %ld = load i32, i32 addrspace(3)* %gep - store i32 %ld, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll deleted file mode 100644 index 4244c48d240..00000000000 --- a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This tests a bug where LegalizeDAG was not checking the target's -; BooleanContents value and always using one for true, when expanding -; setcc to select_cc. -; -; This bug caused the icmp IR instruction to be expanded to two machine -; instructions, when only one is needed. -; - -; CHECK: {{^}}setcc_expand: -; CHECK: SET -; CHECK-NOT: CND -define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp eq i32 %in, 5 - br i1 %0, label %IF, label %ENDIF -IF: - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %1 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg deleted file mode 100644 index ad9ce2541ef..00000000000 --- a/test/CodeGen/R600/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'R600' in config.root.targets: - config.unsupported = True diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll deleted file mode 100644 index cff1c24f89d..00000000000 --- a/test/CodeGen/R600/literals.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Test using an integer literal constant. -; Generated ASM should be: -; ADD_INT KC0[2].Z literal.x, 5 -; or -; ADD_INT literal.x KC0[2].Z, 5 - -; CHECK: {{^}}i32_literal: -; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5 -define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = add i32 5, %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Test using a float literal constant. 
-; Generated ASM should be: -; ADD KC0[2].Z literal.x, 5.0 -; or -; ADD literal.x KC0[2].Z, 5.0 - -; CHECK: {{^}}float_literal: -; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.0 -define void @float_literal(float addrspace(1)* %out, float %in) { -entry: - %0 = fadd float 5.0, %in - store float %0, float addrspace(1)* %out - ret void -} - -; Make sure inline literals are folded into REG_SEQUENCE instructions. -; CHECK: {{^}}inline_literal_reg_sequence: -; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 -; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 - -define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { -entry: - store <4 x i32> , <4 x i32> addrspace(1)* %out - ret void -} - -; CHECK: {{^}}inline_literal_dot4: -; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0 -; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 -; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 -; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 -define void @inline_literal_dot4(float addrspace(1)* %out) { -entry: - %0 = call float @llvm.AMDGPU.dp4(<4 x float> , <4 x float> ) - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll deleted file mode 100644 index 8bf094b8bc7..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.abs.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.abs(i32) nounwind readnone - -; Legacy name -declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_abs_i32: -; SI: s_sub_i32 -; SI: s_max_i32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @s_abs_i32(i32 addrspace(1)* %out, i32 %src) nounwind { - %abs = call i32 @llvm.AMDGPU.abs(i32 %src) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_abs_i32: -; SI: v_sub_i32_e32 -; SI: v_max_i32_e32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { - %val = load i32, i32 addrspace(1)* %src, align 4 - %abs = call i32 @llvm.AMDGPU.abs(i32 %val) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}abs_i32_legacy_amdil: -; SI: v_sub_i32_e32 -; SI: v_max_i32_e32 -; SI: s_endpgm - -; EG: SUB_INT -; EG: MAX_INT -define void @abs_i32_legacy_amdil(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { - %val = load i32, i32 addrspace(1)* %src, align 4 - %abs = call i32 @llvm.AMDIL.abs.i32(i32 %val) nounwind readnone - store i32 %abs, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll deleted file mode 100644 index db883972d64..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG 
-check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_global: -; EG: GROUP_BARRIER -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_global(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.global() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.global() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll deleted file mode 100644 index 48fb2e0b1a8..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_barrier_local: -; EG: GROUP_BARRIER - -; SI: buffer_store_dword -; SI: s_waitcnt -; SI: s_barrier - -define void @test_barrier_local(i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() - %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0 - store i32 %0, i32 addrspace(1)* %1 - call void @llvm.AMDGPU.barrier.local() - %2 = call i32 @llvm.r600.read.local.size.x() - %3 = sub i32 %2, 1 - %4 = sub i32 %3, %0 - %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4 - %6 = load i32, i32 addrspace(1)* %5 - store i32 %6, i32 addrspace(1)* %1 - ret void -} - -declare void @llvm.AMDGPU.barrier.local() - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.local.size.x() #0 - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll deleted file mode 100644 index 1168713ca66..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll +++ /dev/null @@ -1,437 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, 
i32 %src0, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: -; SI: v_bfe_i32 -; EG: BFE_INT -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_bfe_print_arg: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { - %load = load i32, i32 addrspace(1)* %src0, align 4 - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_7: -; SI-NOT: shl -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_8: -; SI: buffer_load_dword -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void 
@bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_13: -; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}bfe_i32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_12(i32 
addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 -; SI: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) - %shl = shl i32 %bfe, 8 - %ashr = ashr i32 %shl, 8 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @simplify_demanded_bfe_sdiv -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16 -; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]] -; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]] -; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] -; SI: buffer_store_dword 
[[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone - %div = sdiv i32 %bfe, 2 - store i32 %div, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll deleted file mode 100644 index 541119242a9..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll +++ /dev/null @@ -1,627 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: -; SI: v_bfe_u32 -; EG: BFE_UINT -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zextload_i8: -; SI: buffer_load_ubyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %ext = zext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: 
{{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 255 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8: -; SI: buffer_load_dword -; SI: v_add_i32 -; SI-NEXT: bfe -; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %load = load i32, i32 addrspace(1)* %in, align 4 - %add = add i32 %load, 1 - %ext = and i32 %add, 65535 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_1: -; SI: buffer_load_dword -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI: s_endpgm -; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -define void @bfe_u32_test_3(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_4: -; SI-NOT: lshl -; SI-NOT: shr -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = lshr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_5: -; SI: buffer_load_dword -; SI-NOT: lshl -; SI-NOT: shr -; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_6: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_7: -; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_11(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_13: -; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = ashr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_test_14: -; SI-NOT: lshr -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = lshr i32 %x, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}bfe_u32_constant_fold_test_5: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* 
%out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18: -; SI-NOT: {{[^@]}}bfe -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 -; SI: buffer_store_dword [[VREG]], -; SI: s_endpgm -; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure that SimplifyDemandedBits doesn't cause the and to be -; reduced to the bits demanded by the bfe. - -; XXX: The operand to v_bfe_u32 could also just directly be the load register. 
-; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg: -; SI: buffer_load_dword [[ARG:v[0-9]+]] -; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]] -; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2 -; SI-DAG: buffer_store_dword [[AND]] -; SI-DAG: buffer_store_dword [[BFE]] -; SI: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, - i32 addrspace(1)* %out1, - i32 addrspace(1)* %in) nounwind { - %src = load i32, i32 addrspace(1)* %in, align 4 - %and = and i32 %src, 63 - %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone - store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4 - store i32 %and, i32 addrspace(1)* %out1, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lshr_and: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = lshr i32 %a, 6 - %c = and i32 %b, 7 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_lshr_and: -; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 -; SI: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = lshr i32 %a, %b - %d = and i32 %c, 7 - store i32 %d, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 448 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}and_lshr2: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 -; SI: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = and i32 %a, 511 - %c = lshr i32 %b, 6 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}shl_lshr: -; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 -; SI: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { - %b = shl i32 %a, 9 - %c = lshr i32 %b, 11 - store i32 %c, i32 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll deleted file mode 100644 index 517a55abc09..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfi_arg_arg_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_arg_arg_imm: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_arg_imm_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { - %bfi = call i32 
@llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfi_imm_arg_arg: -; SI: v_bfi_b32 -; EG: BFI_INT -define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { - %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone - store i32 %bfi, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll deleted file mode 100644 index 50492289d74..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfm_arg_arg: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -; EG: BFM_INT -define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_arg_imm: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x7b -; EG: BFM_INT -define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_imm_arg: -; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, {{s[0-9]+}} -; EG: BFM_INT -define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_imm_imm: -; SI: s_bfm_b32 {{s[0-9]+}}, 0x7b, 0x1c8 -; EG: BFM_INT -define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind { - %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone - store i32 %bfm, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfm_pattern: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) { - %a = shl i32 1, %x - %b = sub i32 %a, 1 - %c = shl i32 %b, %y - store i32 %c, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}bfm_pattern_simple: -; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 -define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) { - %a = shl i32 1, %x - %b = sub i32 %a, 1 - store i32 %b, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll deleted file mode 100644 index 301de4b1c82..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.brev.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_brev_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm 
-define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_brev_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone - store i32 %ctlz, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll deleted file mode 100644 index 11ec963ab31..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float) nounwind readnone -declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone -declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}clamp_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm - -; EG: MOV_SAT -define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fneg = fsub float -0.0, %src - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind { - %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone - %src.fneg.fabs = fsub float -0.0, %src.fabs - %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}clamp_0_1_amdil_legacy_f32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}} -; SI: buffer_store_dword [[RESULT]] -define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind { - %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone - store float %clamp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.class.ll b/test/CodeGen/R600/llvm.AMDGPU.class.ll deleted file mode 100644 index 805a88b59c7..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.class.ll +++ /dev/null @@ -1,497 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i1 @llvm.AMDGPU.class.f32(float, i32) #1 -declare i1 @llvm.AMDGPU.class.f64(double, i32) #1 -declare i32 @llvm.r600.read.tidig.x() #1 -declare float @llvm.fabs.f32(float) #1 -declare double @llvm.fabs.f64(double) #1 - -; SI-LABEL: {{^}}test_class_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float %a) #1 - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fneg = fsub float -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { - %a.fabs = call float @llvm.fabs.f32(float 
%a) #1 - %a.fneg.fabs = fsub float -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f32(float %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}} -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 10 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1023) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_9bit_mask_f32: -; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f32: -; SI-DAG: buffer_load_dword [[VA:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void 
@test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; FIXME: Why isn't this using a literal constant operand? -; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32: -; SI-DAG: buffer_load_dword [[VB:v[0-9]+]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 -; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f32(float 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fneg = fsub double -0.0, %a - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_fneg_fabs_f64: -; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], 
s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { - %a.fabs = call double @llvm.fabs.f64(double %a) #1 - %a.fneg.fabs = fsub double -0.0, %a.fabs - %result = call i1 @llvm.AMDGPU.class.f64(double %a.fneg.fabs, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_1_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} -; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 1) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_64_f64: -; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} -; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 64) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; Set all 9 bits of mask -; SI-LABEL: {{^}}test_class_full_mask_f64: -; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI-NEXT: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}v_test_class_full_mask_f64: -; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}} -; SI: v_cmp_class_f64_e32 vcc, [[VA]], [[MASK]] -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load double, double addrspace(1)* %in - - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 511) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64: -; XSI: v_cmp_class_f64_e32 vcc, 1.0, -; SI: v_cmp_class_f64_e32 vcc, -; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: 
{{^}}test_class_lit_constant_dynamic_mask_f64: -; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} -; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %b = load i32, i32 addrspace(1)* %gep.in - - %result = call i1 @llvm.AMDGPU.class.f64(double 1024.0, i32 %b) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %gep.out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 3) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or3_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - - %sext = sext i1 %or.1 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}} -; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 1) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 2) #1 - %class2 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class3 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %class4 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 16) #1 - %class5 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 32) #1 - %class6 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 64) #1 - %class7 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 128) #1 - %class8 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 256) #1 - %class9 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 512) #1 - %or.0 = or i1 %class0, %class1 - %or.1 = or i1 %or.0, %class2 - %or.2 = or i1 %or.1, %class3 - %or.3 = or i1 %or.2, 
%class4 - %or.4 = or i1 %or.3, %class5 - %or.5 = or i1 %or.4, %class6 - %or.6 = or i1 %or.5, %class7 - %or.7 = or i1 %or.6, %class8 - %or.8 = or i1 %or.7, %class9 - %sext = sext i1 %or.8 to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_1: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_fold_or_class_f32_2: -; SI-NOT: v_cmp_class -; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} -; SI-NOT: v_cmp_class -; SI: s_endpgm -define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 7) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_no_fold_or_class_f32_0: -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}} -; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} -; SI: s_or_b64 -; SI: s_endpgm -define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.in - - %class0 = call i1 @llvm.AMDGPU.class.f32(float %a, i32 4) #1 - %class1 = call i1 @llvm.AMDGPU.class.f32(float %b, i32 8) #1 - %or = or i1 %class0, %class1 - - %sext = sext i1 %or to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f32: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f32(float %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_class_0_f64: -; SI-NOT: v_cmp_class -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { - %result = call i1 @llvm.AMDGPU.class.f64(double %a, i32 0) #1 - %sext = sext i1 %result to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/test/CodeGen/R600/llvm.AMDGPU.cube.ll deleted file mode 
100644 index e95a51093cb..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.cube.ll +++ /dev/null @@ -1,59 +0,0 @@ - -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}cube: -; CHECK: CUBE T{{[0-9]}}.X -; CHECK: CUBE T{{[0-9]}}.Y -; CHECK: CUBE T{{[0-9]}}.Z -; CHECK: CUBE * T{{[0-9]}}.W -define void @cube() #0 { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %1 = extractelement <4 x float> %0, i32 3 - %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %3 = extractelement <4 x float> %2, i32 0 - %4 = fdiv float %3, %1 - %5 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %6 = extractelement <4 x float> %5, i32 1 - %7 = fdiv float %6, %1 - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %9 = extractelement <4 x float> %8, i32 2 - %10 = fdiv float %9, %1 - %11 = insertelement <4 x float> undef, float %4, i32 0 - %12 = insertelement <4 x float> %11, float %7, i32 1 - %13 = insertelement <4 x float> %12, float %10, i32 2 - %14 = insertelement <4 x float> %13, float 1.000000e+00, i32 3 - %15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %14) - %16 = extractelement <4 x float> %15, i32 0 - %17 = extractelement <4 x float> %15, i32 1 - %18 = extractelement <4 x float> %15, i32 2 - %19 = extractelement <4 x float> %15, i32 3 - %20 = call float @fabs(float %18) - %21 = fdiv float 1.000000e+00, %20 - %22 = fmul float %16, %21 - %23 = fadd float %22, 1.500000e+00 - %24 = fmul float %17, %21 - %25 = fadd float %24, 1.500000e+00 - %26 = insertelement <4 x float> undef, float %25, i32 0 - %27 = insertelement <4 x float> %26, float %23, i32 1 - %28 = insertelement <4 x float> %27, float %19, i32 2 - %29 = insertelement <4 x float> %28, float %25, i32 3 - %30 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %29, i32 16, i32 0, i32 4) - call void @llvm.R600.store.swizzle(<4 x float> %30, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } - diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll deleted file mode 100644 index 8b32f696449..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.cvt.f32.ubyte0(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte1(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone -declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone - -; SI-LABEL: {{^}}test_unpack_byte0_to_float: -; SI: v_cvt_f32_ubyte0 -define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float 
@llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte1_to_float: -; SI: v_cvt_f32_ubyte1 -define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte2_to_float: -; SI: v_cvt_f32_ubyte2 -define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_unpack_byte3_to_float: -; SI: v_cvt_f32_ubyte3 -define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone - store float %cvt, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll deleted file mode 100644 index 55ca9c7536e..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone -declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fixup_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fixup_f64: -; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { - %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll deleted file mode 100644 index bcb7f870f1f..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll +++ /dev/null @@ -1,179 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s 
| FileCheck -check-prefix=GCN -check-prefix=VI %s - -; FIXME: Enable for VI. - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate -declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone -declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone - -; GCN-LABEL: {{^}}test_div_fmas_f32: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0: -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VA]], [[VC]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2: -; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] -; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f64: -; GCN: v_div_fmas_f64 -define void 
@test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { - %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: -; SI: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}} -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { - %cmp = icmp eq i32 %i, 0 - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: -; SI: s_mov_b64 vcc, 0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: -; SI: s_mov_b64 vcc, -1 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_logical_cond_to_vcc: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-DAG: v_cmp_eq_i32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}} -; SI-DAG: v_cmp_ne_i32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} -; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] -; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] -; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - %cmp1 = icmp ne i32 %d, 0 - %and = and i1 %cmp0, %cmp1 - - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: -; SI: v_cmp_eq_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]] - -; SI: buffer_load_dword [[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, [[LOAD]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc - - -; SI: BB9_2: -; SI: s_or_b64 exec, 
exec, [[SAVE]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v0 -; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 - %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 - %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 - - %a = load float, float addrspace(1)* %gep.a - %b = load float, float addrspace(1)* %gep.b - %c = load float, float addrspace(1)* %gep.c - - %cmp0 = icmp eq i32 %tid, 0 - br i1 %cmp0, label %bb, label %exit - -bb: - %val = load i32, i32 addrspace(1)* %dummy - %cmp1 = icmp ne i32 %val, 0 - br label %exit - -exit: - %cond = phi i1 [false, %entry], [%cmp1, %bb] - %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone - store float %result, float addrspace(1)* %gep.out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll deleted file mode 100644 index de830de039c..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll +++ /dev/null @@ -1,364 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone -declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; SI-LABEL @test_div_scale_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float 
addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 - - %a = load double, double addrspace(1)* %gep.0, align 8 - %b = load double, double addrspace(1)* %gep.1, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_1: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_num_2: -; SI-DAG: buffer_load_dword [[B:v[0-9]+]] -; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], 
[[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %b = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_scalar_den_2: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr float, float addrspace(1)* %in, i32 %tid - - %a = load float, float addrspace(1)* %gep, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_1: -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_num_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double 
addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %b = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_1: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_scalar_den_2: -; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - - %a = load double, double addrspace(1)* %gep, align 8 - - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_1: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_all_scalar_2: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind 
readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_1: -; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] -; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f64_all_scalar_2: -; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] -; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] -; SI: buffer_store_dwordx2 [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { - %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone - %result0 = extractvalue { double, i1 } %result, 0 - store double %result0, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_inline_imm_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL 
@test_div_scale_f32_fabs_num: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]| -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL @test_div_scale_f32_fabs_den: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]] -; SI: buffer_store_dword [[RESULT0]] -; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone - %result0 = extractvalue { float, i1 } %result, 0 - store float %result0, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll deleted file mode 100644 index 20c7af8ade5..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.flbit.i32.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.flbit.i32(i32) nounwind readnone - -; FUNC-LABEL: {{^}}s_flbit: -; SI: s_load_dword [[VAL:s[0-9]+]], -; SI: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind { - %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_flbit: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { - %val = load i32, i32 addrspace(1)* %valptr, align 4 - %r = call i32 
@llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone - store i32 %r, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll deleted file mode 100644 index e098dd35d6d..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.fract.f64.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s - -declare double @llvm.fabs.f64(double %Val) -declare double @llvm.AMDGPU.fract.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}fract_f64: -; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %fract = call double @llvm.AMDGPU.fract.f64(double %val) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f64_neg: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) nounwind { - %val = load double, double addrspace(1)* %src, align 4 - %neg = fsub double 0.0, %val - %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f64_neg_abs: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 -; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[LO]], v[[MINLO]], [[COND]] -; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[HI]], v[[MINHI]], [[COND]] -; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] -; CI: buffer_store_dwordx2 [[FRC]] -define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) nounwind 
{ - %val = load double, double addrspace(1)* %src, align 4 - %abs = call double @llvm.fabs.f64(double %val) - %neg = fsub double 0.0, %abs - %fract = call double @llvm.AMDGPU.fract.f64(double %neg) nounwind readnone - store double %fract, double addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll deleted file mode 100644 index 7501b4b7546..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.fabs.f32(float %Val) -declare float @llvm.AMDGPU.fract.f32(float) nounwind readnone - -; Legacy name -declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}fract_f32: -; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %fract = call float @llvm.AMDGPU.fract.f32(float %val) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_legacy_amdil: -; CI: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_floor_f32_e32 [[FLR:v[0-9]+]], [[INPUT:v[0-9]+]] -; SI: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[FLR]], [[INPUT]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %fract = call float @llvm.AMDIL.fraction.f32(float %val) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_neg: -; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] -; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -[[INPUT:v[0-9]+]] -; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT]], [[FLR]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %neg = fsub float 0.0, %val - %fract = call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fract_f32_neg_abs: -; CI: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| -; SI: v_floor_f32_e64 [[FLR:v[0-9]+]], -|[[INPUT:v[0-9]+]]| -; SI: v_sub_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT]]|, [[FLR]] -; GCN: buffer_store_dword [[RESULT]] -; EG: FRACT -define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) nounwind { - %val = load float, float addrspace(1)* %src, align 4 - %abs = call float @llvm.fabs.f32(float %val) - %neg = fsub float 0.0, %abs - %fract = 
call float @llvm.AMDGPU.fract.f32(float %neg) nounwind readnone - store float %fract, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll deleted file mode 100644 index 42102e30f07..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FIXME: Store of i32 seems to be broken pre-EG somehow? - -declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_imad24: -; SI: v_mad_i32_i24 -; CM: MULADD_INT24 -; R600: MULLO_INT -; R600: ADD_INT -define void @test_imad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mad = call i32 @llvm.AMDGPU.imad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll deleted file mode 100644 index 46662f96c29..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_imax: -; SI: v_max_i32_e32 -define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %load) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_imax: -; SI: s_max_i32 -define void @scalar_imax(i32 %p0, i32 %p1) #0 { -entry: - %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.imax(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll deleted file mode 100644 index 34b454e2375..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_imin: -; SI: v_min_i32_e32 -define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %min = 
call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %load) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_imin: -; SI: s_min_i32 -define void @scalar_imin(i32 %p0, i32 %p1) #0 { -entry: - %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.imin(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll deleted file mode 100644 index fdc1172260b..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_imul24: -; SI: v_mul_i32_i24 -; CM: MUL_INT24 -; R600: MULLO_INT -define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll deleted file mode 100644 index 057708e7b5c..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kill_gs_const: -; SI-NOT: v_cmpx_le_f32 -; SI: s_mov_b64 exec, 0 - -define void @kill_gs_const() #0 { -main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kill(float %3) - ret void -} - -; SI-LABEL: {{^}}kill_vcc_implicit_def: -; SI-NOT: v_cmp_gt_f32_e32 vcc, -; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} -; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { -entry: - %tmp0 = fcmp olt float %13, 0.0 - call void @llvm.AMDGPU.kill(float %14) - %tmp1 = select i1 %tmp0, float 1.0, float 0.0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1) - ret void -} - -declare void 
@llvm.AMDGPU.kill(float) -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="2" } -attributes #1 = { "ShaderType"="0" } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll deleted file mode 100644 index a59c0ce6d67..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone -declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_ldexp_f32: -; SI: v_ldexp_f32 -; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { - %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone - store float %result, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_ldexp_f64: -; SI: v_ldexp_f64 -; SI: s_endpgm -define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { - %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll deleted file mode 100644 index 4cafd563685..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_legacy_f32: -; SI: v_rsq_legacy_f32_e32 -; EG: RECIPSQRT_IEEE -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll deleted file mode 100644 index 83b56a5029d..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.mul(float ,float ) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll deleted file mode 100644 index d2a655bf909..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f64: -; SI: v_rcp_f64_e32 -define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rcp_pat_f64: -; SI: v_rcp_f64_e32 -define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %rcp = fdiv double 1.0, %src - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE-NOT: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64 -; SI-SAFE: v_rcp_f64 -define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { - %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone - %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone - store double %rcp, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll deleted file mode 100644 index edd6e9a72f1..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s - -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone -declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone - -declare float @llvm.sqrt.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rcp_f32: -; SI: v_rcp_f32_e32 -; EG: RECIP_IEEE -define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Evergreen only ever does unsafe fp math. 
-; FUNC-LABEL: {{^}}rcp_pat_f32: - -; SI-SAFE: v_rcp_f32_e32 -; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32 - -; EG: RECIP_IEEE - -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %rcp = fdiv float 1.0, %src - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_rcp_pat_f32: -; SI-UNSAFE: v_rsq_f32_e32 -; SI-SAFE: v_sqrt_f32_e32 -; SI-SAFE: v_rcp_f32_e32 - -; EG: RECIPSQRT_IEEE -define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone - %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone - store float %rcp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll deleted file mode 100644 index 67f1d22c717..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f64: -; SI: v_rsq_clamp_f64_e32 - -; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] -; TODO: this constant should be folded: -; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 -; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] -; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] - -define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { - %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone - store double %rsq_clamped, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll deleted file mode 100644 index eeff2536b23..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f32: -; SI: v_rsq_clamp_f32_e32 - -; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} -; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] -; TODO: this constant should be folded: -; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff -; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] - -; EG: RECIPSQRT_CLAMPED - -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone - store float %rsq_clamped, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll deleted file mode 100644 index 36b72f14db1..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn 
-mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; EG: RECIPSQRT_IEEE -define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; TODO: Really these should be constant folded -; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll b/test/CodeGen/R600/llvm.AMDGPU.tex.ll deleted file mode 100644 index 10206609bb5..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.tex.ll +++ /dev/null @@ -1,42 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:UUNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZZ}} RID:0 SID:0 CT:UUNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYYW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYYZ}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN -;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN - -define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %addr = load <4 x float>, <4 x float> addrspace(1)* %in - %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) - %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) - %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) - %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) - %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) - %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6) - %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, 
i32 0, i32 7) - %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) - %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) - %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) - %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) - %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) - %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) - %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) - %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) - %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) - store <4 x float> %res16, <4 x float> addrspace(1)* %out - ret void -} - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll deleted file mode 100644 index 6b546a7e17c..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone - -; SI-LABEL: {{^}}test_trig_preop_f64: -; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]] -; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment: -; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]], -; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 -; SI: buffer_store_dwordx2 [[RESULT]], -; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { - %a = load double, double addrspace(1)* %aptr, align 8 - %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone - store double %result, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll deleted file mode 100644 index 74792e50017..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; R600: {{^}}amdgpu_trunc: -; R600: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: {{^}}amdgpu_trunc: -; SI: v_trunc_f32 - -define void @amdgpu_trunc(float addrspace(1)* %out, float %x) { -entry: - %0 = call float @llvm.AMDGPU.trunc(float %x) - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDGPU.trunc(float ) readnone diff --git 
a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll deleted file mode 100644 index 77a073b0cb0..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}test_umad24: -; SI: v_mad_u32_u24 -; EG: MULADD_UINT24 -; R600: MULLO_UINT -; R600: ADD_INT -define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 %src1, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}commute_umad24: -; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]] -; SI: buffer_store_dword [[RESULT]] -define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %src0.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %src2.gep = getelementptr i32, i32 addrspace(1)* %src0.gep, i32 1 - - %src0 = load i32, i32 addrspace(1)* %src0.gep, align 4 - %src2 = load i32, i32 addrspace(1)* %src2.gep, align 4 - %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone - store i32 %mad, i32 addrspace(1)* %out.gep, align 4 - ret void -} - diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll deleted file mode 100644 index a97d103016d..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_umax: -; SI: v_max_u32_e32 -define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %load) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_umax: -; SI: s_max_u32 -define void @scalar_umax(i32 %p0, i32 %p1) #0 { -entry: - %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1) - %bc = bitcast i32 %max to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}trunc_zext_umax: -; SI: buffer_load_ubyte [[VREG:v[0-9]+]], -; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] -; SI-NOT: and -; SI: 
buffer_store_short [[RESULT]], -define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = zext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.umax(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = zext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.umax(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll deleted file mode 100644 index 2acd10e0c63..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}vector_umin: -; SI: v_min_u32_e32 -define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 { -main_body: - %load = load i32, i32 addrspace(1)* %in, align 4 - %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %load) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}scalar_umin: -; SI: s_min_u32 -define void @scalar_umin(i32 %p0, i32 %p1) #0 { -entry: - %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1) - %bc = bitcast i32 %min to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %bc, float %bc, float %bc, float %bc) - ret void -} - -; SI-LABEL: {{^}}trunc_zext_umin: -; SI: buffer_load_ubyte [[VREG:v[0-9]+]], -; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]] -; SI-NOT: and -; SI: buffer_store_short [[RESULT]], -define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = zext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.umin(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = zext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -; Function Attrs: readnone -declare i32 @llvm.AMDGPU.umin(i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll deleted file mode 100644 index 76624a078b3..00000000000 --- a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r600 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s -; XUN: llc -march=r600 -mcpu=r770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - 
-declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}test_umul24: -; SI: v_mul_u32_u24 -; R600: MUL_UINT24 -; R600: MULLO_UINT -define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { - %mul = call i32 @llvm.AMDGPU.umul24(i32 %src0, i32 %src1) nounwind readnone - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.ll b/test/CodeGen/R600/llvm.SI.fs.interp.ll deleted file mode 100644 index 3d05da616e4..00000000000 --- a/test/CodeGen/R600/llvm.SI.fs.interp.ll +++ /dev/null @@ -1,59 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s - -;GCN-LABEL: {{^}}main: -;GCN-NOT: s_wqm -;GCN: s_mov_b32 -;GCN-NEXT: v_interp_mov_f32 -;GCN: v_interp_p1_f32 -;GCN: v_interp_p2_f32 - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { -main_body: - %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) - %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7) - ret void -} - -; Thest that v_interp_p1 uses different source and destination registers -; on 16 bank LDS chips. - -; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: -; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] - -define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { -main_body: - %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) - %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) - %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7) - %25 = call float @fabs(float %22) - %26 = call float @fabs(float %23) - %27 = call float @fabs(float %24) - %28 = call i32 @llvm.SI.packf16(float %25, float %26) - %29 = bitcast i32 %28 to float - %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00) - %31 = bitcast i32 %30 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31) - ret void -} - -; Function Attrs: readnone -declare float @fabs(float) #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.constant(i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll deleted file mode 100644 index 275cb580bc9..00000000000 --- a/test/CodeGen/R600/llvm.SI.gather4.ll +++ /dev/null @@ -1,509 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde 
-verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}gather4_v2: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4: -;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl: -;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l: -;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b: -;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl: -;CHECK: 
image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_v8: -;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_v2: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz: -;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_o: -;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_cl_o_v8: -;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_cl_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_l_o_v8: -;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_l_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_o_v8: -;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_o_v8() #0 { -main_body: - 
%r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_b_cl_o: -;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_b_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_lz_o: -;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_lz_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c: -;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_v8: -;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> 
undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_v8: -;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_v8: -;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl: -;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x 
float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz: -;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -;CHECK-LABEL: {{^}}gather4_c_o: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_o_v8: -;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_cl_o: -;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_l_o: -;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_l_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement 
<4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_o: -;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_b_cl_o: -;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_b_cl_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz_o() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: -;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @gather4_c_lz_o_v8() #0 { -main_body: - %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - - -declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <32 x i8>, 
<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll deleted file mode 100644 index 06ee98e91b3..00000000000 --- a/test/CodeGen/R600/llvm.SI.getlod.ll +++ /dev/null @@ -1,45 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}getlod: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v2: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod_v2() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - -;CHECK-LABEL: {{^}}getlod_v4: -;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getlod_v4() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1) - ret void -} - - -declare <4 x float> @llvm.SI.getlod.i32(i32, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.ll b/test/CodeGen/R600/llvm.SI.image.ll deleted file mode 100644 index 0fac8d79956..00000000000 --- a/test/CodeGen/R600/llvm.SI.image.ll +++ /dev/null @@ -1,50 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}image_load: -;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} -define void @image_load() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}image_load_mip: -;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @image_load_mip() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}getresinfo: -;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @getresinfo() #0 { -main_body: - %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.sample.ll b/test/CodeGen/R600/llvm.SI.image.sample.ll deleted file mode 100644 index 4bc638a2806..00000000000 --- a/test/CodeGen/R600/llvm.SI.image.sample.ll +++ /dev/null @@ -1,310 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} -define void @sample_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void 
@sample_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cl() #0 { 
-main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b_cl() 
#0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x 
float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/test/CodeGen/R600/llvm.SI.image.sample.o.ll deleted file mode 100644 index 9d8935414ed..00000000000 --- a/test/CodeGen/R600/llvm.SI.image.sample.o.ll +++ /dev/null @@ -1,310 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}sample: -;CHECK: s_wqm -;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cl: -;CHECK: s_wqm -;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 
1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b: -;CHECK: s_wqm -;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 
0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c: -;CHECK: s_wqm -;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 
0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_d_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_d_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_l: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_l() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b: -;CHECK: s_wqm -;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_b_cl: -;CHECK: s_wqm -;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_b_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void 
@llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_lz: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_lz() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - -;CHECK-LABEL: {{^}}sample_c_cd_cl: -;CHECK-NOT: s_wqm -;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @sample_c_cd_cl() #0 { -main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %r0 = extractelement <4 x float> %r, i32 0 - %r1 = extractelement <4 x float> %r, i32 1 - %r2 = extractelement <4 x float> %r, i32 2 - %r3 = extractelement <4 x float> %r, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) - ret void -} - - -declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, 
i32, i32, i32, i32) #1 - -declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll deleted file mode 100644 index b67716c3b66..00000000000 --- a/test/CodeGen/R600/llvm.SI.imageload.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1, - <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2, - <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3, - <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4, - <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5, - <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x 
i32> %v6, - <32 x i8> undef, i32 6) - %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10, - <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11, - <32 x i8> undef, i32 11) - %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15, - <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16, - <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s9 = add i32 %s5, %e10 - %s10 = add i32 %s9, %e11 - %s14 = add i32 %s10, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -; Test that ccordinates are stored in vgprs and not sgprs -; CHECK: vgpr_coords -; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}} -define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr float addrspace(2)*, float addrspace(2)* addrspace(2)* %0, i32 0 - %21 = load float addrspace(2)*, float addrspace(2)* addrspace(2)* %20, !tbaa !2 - %22 = getelementptr float, float addrspace(2)* %21, i32 0 - %23 = load float, float addrspace(2)* %22, !tbaa !2, !invariant.load !1 - %24 = getelementptr float, float addrspace(2)* %21, i32 1 - %25 = load float, float addrspace(2)* %24, !tbaa !2, !invariant.load !1 - %26 = getelementptr float, float addrspace(2)* %21, i32 4 - %27 = load float, float addrspace(2)* %26, !tbaa !2, !invariant.load !1 - %28 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %29 = load <32 x i8>, <32 x i8> addrspace(2)* %28, !tbaa !2 - %30 = bitcast float %27 to i32 - %31 = bitcast float %23 to i32 - %32 = bitcast float %25 to i32 - %33 = insertelement <4 x i32> undef, i32 %31, i32 0 - %34 = insertelement <4 x i32> %33, i32 %32, i32 1 - %35 = insertelement <4 x i32> %34, i32 %30, i32 2 - %36 = insertelement <4 x i32> %35, i32 undef, i32 3 - %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2) - %38 = extractelement <4 x i32> %37, i32 0 - %39 = extractelement <4 x i32> %37, i32 1 - %40 = extractelement <4 x i32> %37, 
i32 2 - %41 = extractelement <4 x i32> %37, i32 3 - %42 = bitcast i32 %38 to float - %43 = bitcast i32 %39 to float - %44 = bitcast i32 %40 to float - %45 = bitcast i32 %41 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45) - ret void -} - -declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone -; Function Attrs: nounwind readnone -declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{} -!2 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll deleted file mode 100644 index f6c258539d5..00000000000 --- a/test/CodeGen/R600/llvm.SI.load.dword.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -; Example of a simple geometry shader loading vertex attributes from the -; ESGS ring buffer - -; FIXME: Out of bounds immediate offset crashes - -; CHECK-LABEL: {{^}}main: -; CHECK: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc -; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc - -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { -main_body: - %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) - %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) - %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) - %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp19 = bitcast i32 %tmp18 to float - - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp21 = bitcast i32 %tmp20 to float - - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp23 = bitcast i32 %tmp22 to float - - call void @llvm.SI.export(i32 15, i32 0, i32 1, 
i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23) - ret void -} - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll deleted file mode 100644 index ac95fd0b83a..00000000000 --- a/test/CodeGen/R600/llvm.SI.resinfo.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0 -; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, - i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) { - %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1) - %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2) - %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3) - %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4) - %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5) - %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6) - %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7) - %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8) - %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9) - %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10) - %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11) - %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12) - %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13) - %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14) - %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15) - %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16) - %e1 = extractelement <4 x i32> %res1, i32 0 - %e2 = 
extractelement <4 x i32> %res2, i32 1 - %e3 = extractelement <4 x i32> %res3, i32 2 - %e4 = extractelement <4 x i32> %res4, i32 3 - %t0 = extractelement <4 x i32> %res5, i32 0 - %t1 = extractelement <4 x i32> %res5, i32 1 - %e5 = add i32 %t0, %t1 - %t2 = extractelement <4 x i32> %res6, i32 0 - %t3 = extractelement <4 x i32> %res6, i32 2 - %e6 = add i32 %t2, %t3 - %t4 = extractelement <4 x i32> %res7, i32 0 - %t5 = extractelement <4 x i32> %res7, i32 3 - %e7 = add i32 %t4, %t5 - %t6 = extractelement <4 x i32> %res8, i32 1 - %t7 = extractelement <4 x i32> %res8, i32 2 - %e8 = add i32 %t6, %t7 - %t8 = extractelement <4 x i32> %res9, i32 1 - %t9 = extractelement <4 x i32> %res9, i32 3 - %e9 = add i32 %t8, %t9 - %t10 = extractelement <4 x i32> %res10, i32 2 - %t11 = extractelement <4 x i32> %res10, i32 3 - %e10 = add i32 %t10, %t11 - %t12 = extractelement <4 x i32> %res11, i32 0 - %t13 = extractelement <4 x i32> %res11, i32 1 - %t14 = extractelement <4 x i32> %res11, i32 2 - %t15 = add i32 %t12, %t13 - %e11 = add i32 %t14, %t15 - %t16 = extractelement <4 x i32> %res12, i32 0 - %t17 = extractelement <4 x i32> %res12, i32 1 - %t18 = extractelement <4 x i32> %res12, i32 3 - %t19 = add i32 %t16, %t17 - %e12 = add i32 %t18, %t19 - %t20 = extractelement <4 x i32> %res13, i32 0 - %t21 = extractelement <4 x i32> %res13, i32 2 - %t22 = extractelement <4 x i32> %res13, i32 3 - %t23 = add i32 %t20, %t21 - %e13 = add i32 %t22, %t23 - %t24 = extractelement <4 x i32> %res14, i32 1 - %t25 = extractelement <4 x i32> %res14, i32 2 - %t26 = extractelement <4 x i32> %res14, i32 3 - %t27 = add i32 %t24, %t25 - %e14 = add i32 %t26, %t27 - %t28 = extractelement <4 x i32> %res15, i32 0 - %t29 = extractelement <4 x i32> %res15, i32 1 - %t30 = extractelement <4 x i32> %res15, i32 2 - %t31 = extractelement <4 x i32> %res15, i32 3 - %t32 = add i32 %t28, %t29 - %t33 = add i32 %t30, %t31 - %e15 = add i32 %t32, %t33 - %e16 = extractelement <4 x i32> %res16, i32 3 - %s1 = add i32 %e1, %e2 - %s2 = add i32 %s1, %e3 - %s3 = add i32 %s2, %e4 - %s4 = add i32 %s3, %e5 - %s5 = add i32 %s4, %e6 - %s6 = add i32 %s5, %e7 - %s7 = add i32 %s6, %e8 - %s8 = add i32 %s7, %e9 - %s9 = add i32 %s8, %e10 - %s10 = add i32 %s9, %e11 - %s11 = add i32 %s10, %e12 - %s12 = add i32 %s11, %e13 - %s13 = add i32 %s12, %e14 - %s14 = add i32 %s13, %e15 - %s15 = add i32 %s14, %e16 - %s16 = bitcast i32 %s15 to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s16, float %s16, float %s16, float %s16) - ret void -} - -declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll deleted file mode 100644 index ce9558cbf81..00000000000 --- a/test/CodeGen/R600/llvm.SI.sample-masked.ll +++ /dev/null @@ -1,96 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s - -; CHECK-LABEL: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; 
CHECK-LABEL: {{^}}v2: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11 -define void @v2(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v3: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -define void @v3(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - %4 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v4: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7 -define void @v4(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) - ret void -} - -; CHECK-LABEL: {{^}}v5: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -define void @v5(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v6: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6 -define void @v6(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 1 - %3 = extractelement <4 x float> %1, i32 2 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -; CHECK-LABEL: {{^}}v7: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9 -define void @v7(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) - ret void -} - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll deleted file mode 100644 index 509c45f588b..00000000000 --- a/test/CodeGen/R600/llvm.SI.sample.ll +++ /dev/null @@ -1,160 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga 
-verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15, - <32 x i8> undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> 
%res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -; CHECK: {{^}}v1: -; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15 -define void @v1(i32 %a1) #0 { -entry: - %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 - %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0) - %2 = extractelement <4 x float> %1, i32 0 - %3 = extractelement <4 x float> %1, i32 1 - %4 = extractelement <4 x float> %1, i32 2 - %5 = extractelement <4 x float> %1, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5) - ret void -} - - -declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll deleted file mode 100644 index 
f2badff2a99..00000000000 --- a/test/CodeGen/R600/llvm.SI.sampled.ll +++ /dev/null @@ -1,143 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11 -;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13 -;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14 -;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8 - -define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 { - %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0 - %v2 = insertelement <4 x i32> undef, i32 %a1, i32 1 - %v3 = insertelement <4 x i32> undef, i32 %a1, i32 2 - %v4 = insertelement <4 x i32> undef, i32 %a1, i32 3 - %v5 = insertelement <4 x i32> undef, i32 %a2, i32 0 - %v6 = insertelement <4 x i32> undef, i32 %a2, i32 1 - %v7 = insertelement <4 x i32> undef, i32 %a2, i32 2 - %v8 = insertelement <4 x i32> undef, i32 %a2, i32 3 - %v9 = insertelement <4 x i32> undef, i32 %a3, i32 0 - %v10 = insertelement <4 x i32> undef, i32 %a3, i32 1 - %v11 = insertelement <4 x i32> undef, i32 %a3, i32 2 - %v12 = insertelement <4 x i32> undef, i32 %a3, i32 3 - %v13 = insertelement <4 x i32> undef, i32 %a4, i32 0 - %v14 = insertelement <4 x i32> undef, i32 %a4, i32 1 - %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2 - %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3 - %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1, - <32 x i8> undef, <16 x i8> undef, i32 1) - %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2, - <32 x i8> undef, <16 x i8> undef, i32 2) - %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3, - <32 x i8> undef, <16 x i8> undef, i32 3) - %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4, - <32 x i8> undef, <16 x i8> undef, i32 4) - %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5, - <32 x i8> undef, <16 x i8> undef, i32 5) - %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6, - <32 x i8> undef, <16 x i8> undef, i32 6) - %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7, - <32 x i8> undef, <16 x i8> undef, i32 7) - %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8, - <32 x i8> undef, <16 x i8> undef, i32 8) - %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9, - <32 x i8> undef, <16 x i8> undef, i32 9) - %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10, - <32 x i8> undef, <16 x i8> undef, i32 10) - %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11, - <32 x i8> undef, <16 x i8> undef, i32 11) - %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12, - <32 x i8> undef, <16 x i8> undef, i32 12) - %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13, - <32 x i8> undef, <16 x i8> undef, i32 13) - %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14, - <32 x i8> undef, <16 x i8> undef, i32 14) - %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15, - <32 x i8> 
undef, <16 x i8> undef, i32 15) - %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16, - <32 x i8> undef, <16 x i8> undef, i32 16) - %e1 = extractelement <4 x float> %res1, i32 0 - %e2 = extractelement <4 x float> %res2, i32 1 - %e3 = extractelement <4 x float> %res3, i32 2 - %e4 = extractelement <4 x float> %res4, i32 3 - %t0 = extractelement <4 x float> %res5, i32 0 - %t1 = extractelement <4 x float> %res5, i32 1 - %e5 = fadd float %t0, %t1 - %t2 = extractelement <4 x float> %res6, i32 0 - %t3 = extractelement <4 x float> %res6, i32 2 - %e6 = fadd float %t2, %t3 - %t4 = extractelement <4 x float> %res7, i32 0 - %t5 = extractelement <4 x float> %res7, i32 3 - %e7 = fadd float %t4, %t5 - %t6 = extractelement <4 x float> %res8, i32 1 - %t7 = extractelement <4 x float> %res8, i32 2 - %e8 = fadd float %t6, %t7 - %t8 = extractelement <4 x float> %res9, i32 1 - %t9 = extractelement <4 x float> %res9, i32 3 - %e9 = fadd float %t8, %t9 - %t10 = extractelement <4 x float> %res10, i32 2 - %t11 = extractelement <4 x float> %res10, i32 3 - %e10 = fadd float %t10, %t11 - %t12 = extractelement <4 x float> %res11, i32 0 - %t13 = extractelement <4 x float> %res11, i32 1 - %t14 = extractelement <4 x float> %res11, i32 2 - %t15 = fadd float %t12, %t13 - %e11 = fadd float %t14, %t15 - %t16 = extractelement <4 x float> %res12, i32 0 - %t17 = extractelement <4 x float> %res12, i32 1 - %t18 = extractelement <4 x float> %res12, i32 3 - %t19 = fadd float %t16, %t17 - %e12 = fadd float %t18, %t19 - %t20 = extractelement <4 x float> %res13, i32 0 - %t21 = extractelement <4 x float> %res13, i32 2 - %t22 = extractelement <4 x float> %res13, i32 3 - %t23 = fadd float %t20, %t21 - %e13 = fadd float %t22, %t23 - %t24 = extractelement <4 x float> %res14, i32 1 - %t25 = extractelement <4 x float> %res14, i32 2 - %t26 = extractelement <4 x float> %res14, i32 3 - %t27 = fadd float %t24, %t25 - %e14 = fadd float %t26, %t27 - %t28 = extractelement <4 x float> %res15, i32 0 - %t29 = extractelement <4 x float> %res15, i32 1 - %t30 = extractelement <4 x float> %res15, i32 2 - %t31 = extractelement <4 x float> %res15, i32 3 - %t32 = fadd float %t28, %t29 - %t33 = fadd float %t30, %t31 - %e15 = fadd float %t32, %t33 - %e16 = extractelement <4 x float> %res16, i32 3 - %s1 = fadd float %e1, %e2 - %s2 = fadd float %s1, %e3 - %s3 = fadd float %s2, %e4 - %s4 = fadd float %s3, %e5 - %s5 = fadd float %s4, %e6 - %s6 = fadd float %s5, %e7 - %s7 = fadd float %s6, %e8 - %s8 = fadd float %s7, %e9 - %s9 = fadd float %s8, %e10 - %s10 = fadd float %s9, %e11 - %s11 = fadd float %s10, %e12 - %s12 = fadd float %s11, %e13 - %s13 = fadd float %s12, %e14 - %s14 = fadd float %s13, %e15 - %s15 = fadd float %s14, %e16 - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %s15, float %s15, float %s15, float %s15) - ret void -} - -declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll b/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll deleted file mode 100644 index 2198590f2df..00000000000 --- a/test/CodeGen/R600/llvm.SI.sendmsg-m0.ll +++ /dev/null @@ -1,20 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=BOTH %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}main: -; BOTH: 
s_mov_b32 m0, s0 -; VI-NEXT: s_nop 0 -; BOTH-NEXT: s_sendmsg Gs_done(nop) -; BOTH-NEXT: s_endpgm - -define void @main(i32 inreg %a) #0 { -main_body: - call void @llvm.SI.sendmsg(i32 3, i32 %a) - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #1 - -attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll deleted file mode 100644 index 09675d50335..00000000000 --- a/test/CodeGen/R600/llvm.SI.sendmsg.ll +++ /dev/null @@ -1,24 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_mov_b32 m0, 0 -; CHECK-NOT: s_mov_b32 m0 -; CHECK: s_sendmsg Gs(emit stream 0) -; CHECK: s_sendmsg Gs(cut stream 1) -; CHECK: s_sendmsg Gs(emit-cut stream 2) -; CHECK: s_sendmsg Gs_done(nop) - -define void @main() { -main_body: - call void @llvm.SI.sendmsg(i32 34, i32 0); - call void @llvm.SI.sendmsg(i32 274, i32 0); - call void @llvm.SI.sendmsg(i32 562, i32 0); - call void @llvm.SI.sendmsg(i32 3, i32 0); - ret void -} - -; Function Attrs: nounwind -declare void @llvm.SI.sendmsg(i32, i32) #0 - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll deleted file mode 100644 index 71f51548a5f..00000000000 --- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll +++ /dev/null @@ -1,47 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}test1: -;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test1(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, - i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test2: -;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test2(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, - i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test3: -;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test3(i32 %a1, i32 %vaddr) #0 { - %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, - i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -;CHECK-LABEL: {{^}}test4: -;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test4(i32 %vdata, i32 %vaddr) #0 { - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, - i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, - i32 1, i32 0) - ret void -} - -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void 
@llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll deleted file mode 100644 index f6e6d7050ba..00000000000 --- a/test/CodeGen/R600/llvm.SI.tid.ll +++ /dev/null @@ -1,18 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s - -;GCN: v_mbcnt_lo_u32_b32_e64 -;SI: v_mbcnt_hi_u32_b32_e32 -;VI: v_mbcnt_hi_u32_b32_e64 - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { -main_body: - %4 = call i32 @llvm.SI.tid() - %5 = bitcast i32 %4 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5) - ret void -} - -declare i32 @llvm.SI.tid() readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/test/CodeGen/R600/llvm.amdgpu.dp4.ll deleted file mode 100644 index 036cd2ca82a..00000000000 --- a/test/CodeGen/R600/llvm.amdgpu.dp4.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone - -define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { - %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 - %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 - %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone - store float %dp4, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/test/CodeGen/R600/llvm.amdgpu.kilp.ll deleted file mode 100644 index 42df6db1ccf..00000000000 --- a/test/CodeGen/R600/llvm.amdgpu.kilp.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}kilp_gs_const: -; SI: s_mov_b64 exec, 0 -define void @kilp_gs_const() #0 { -main_body: - %0 = icmp ule i32 0, 3 - %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kilp(float %1) - %2 = icmp ule i32 3, 0 - %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00 - call void @llvm.AMDGPU.kilp(float %3) - ret void -} - -declare void @llvm.AMDGPU.kilp(float) - -attributes #0 = { "ShaderType"="2" } - -!0 = !{!"const", null, i32 1} diff --git a/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/test/CodeGen/R600/llvm.amdgpu.lrp.ll deleted file mode 100644 index 4e4c2ec7791..00000000000 --- a/test/CodeGen/R600/llvm.amdgpu.lrp.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone - -; FUNC-LABEL: {{^}}test_lrp: -; SI: v_sub_f32 -; SI: v_mad_f32 -define void @test_lrp(float 
addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { - %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone - store float %mad, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll deleted file mode 100644 index c65df8b3e8d..00000000000 --- a/test/CodeGen/R600/llvm.cos.ll +++ /dev/null @@ -1,41 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s -check-prefix=SI -check-prefix=FUNC - -;FUNC-LABEL: test -;EG: MULADD_IEEE * -;EG: FRACT * -;EG: ADD * -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG-NOT: COS -;SI: v_cos_f32 -;SI-NOT: v_cos_f32 - -define void @test(float addrspace(1)* %out, float %x) #1 { - %cos = call float @llvm.cos.f32(float %x) - store float %cos, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: testv -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -;EG-NOT: COS -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI: v_cos_f32 -;SI-NOT: v_cos_f32 - -define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { - %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) - store <4 x float> %cos, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.cos.f32(float) readnone -declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll deleted file mode 100644 index 42698925aae..00000000000 --- a/test/CodeGen/R600/llvm.exp2.ll +++ /dev/null @@ -1,80 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -;FUNC-LABEL: {{^}}test: -;EG: EXP_IEEE -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 - -define void @test(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.exp2.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv2: -;EG: EXP_IEEE -;EG: EXP_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. 
-;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 -;SI: v_exp_f32 - -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv4: -;EG: EXP_IEEE -;EG: EXP_IEEE -;EG: EXP_IEEE -;EG: EXP_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_exp_f32 -;SI: v_exp_f32 -;SI: v_exp_f32 -;SI: v_exp_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.exp2.f32(float) readnone -declare <2 x float> @llvm.exp2.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.exp2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll deleted file mode 100644 index c75e7850b35..00000000000 --- a/test/CodeGen/R600/llvm.log2.ll +++ /dev/null @@ -1,80 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC -;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -;FUNC-LABEL: {{^}}test: -;EG: LOG_IEEE -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 - -define void @test(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.log2.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv2: -;EG: LOG_IEEE -;EG: LOG_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. 
-;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 -;SI: v_log_f32 - -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}testv4: -;EG: LOG_IEEE -;EG: LOG_IEEE -;EG: LOG_IEEE -;EG: LOG_IEEE -; FIXME: We should be able to merge these packets together on Cayman so we -; have a maximum of 4 instructions. -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED) -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} -;SI: v_log_f32 -;SI: v_log_f32 -;SI: v_log_f32 -;SI: v_log_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.log2.f32(float) readnone -declare <2 x float> @llvm.log2.v2f32(<2 x float>) readnone -declare <4 x float> @llvm.log2.v4f32(<4 x float>) readnone diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll deleted file mode 100644 index e491732cf9c..00000000000 --- a/test/CodeGen/R600/llvm.memcpy.ll +++ /dev/null @@ -1,365 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind -declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind - - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: 
ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2: -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4: -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind - ret void -} - -; FIXME: Use 64-bit ops -; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8: - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: ds_read_b32 -; SI-DAG: ds_write_b32 - -; SI-DAG: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* - %bcout = bitcast i64 
addrspace(3)* %out to i8 addrspace(3)* - call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1: -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_load_ubyte -; SI-DAG: buffer_store_byte - -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2: -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort -; SI-DAG: buffer_load_ushort - -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short -; SI-DAG: buffer_store_short - -; 
SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind - ret void -} - -; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16: -; SI: buffer_load_dwordx4 -; SI: buffer_load_dwordx4 -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { - %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* - %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind - ret void -} diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll deleted file mode 100644 index c4ae652619c..00000000000 --- a/test/CodeGen/R600/llvm.pow.ll +++ /dev/null @@ -1,40 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-LABEL: test1: -;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, - -define void @test1(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = call float @llvm.pow.f32( float %r0, float %r1) - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -;CHECK-LABEL: test2: -;CHECK: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: LOG_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: LOG_IEEE * 
T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, -;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { - %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare float @llvm.pow.f32(float ,float ) readonly -declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll deleted file mode 100644 index c63fb172794..00000000000 --- a/test/CodeGen/R600/llvm.rint.f64.ll +++ /dev/null @@ -1,46 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rint_f64: -; CI: v_rndne_f64_e32 - -; SI-DAG: v_add_f64 -; SI-DAG: v_add_f64 -; SI-DAG v_cmp_gt_f64_e64 -; SI: v_cndmask_b32 -; SI: v_cndmask_b32 -; SI: s_endpgm -define void @rint_f64(double addrspace(1)* %out, double %in) { -entry: - %0 = call double @llvm.rint.f64(double %in) - store double %0, double addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v2f64: -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { -entry: - %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) - store <2 x double> %0, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v4f64: -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -; CI: v_rndne_f64_e32 -define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { -entry: - %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) - store <4 x double> %0, <4 x double> addrspace(1)* %out - ret void -} - - -declare double @llvm.rint.f64(double) #0 -declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0 -declare <4 x double> @llvm.rint.v4f64(<4 x double>) #0 diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll deleted file mode 100644 index 661db51ad03..00000000000 --- a/test/CodeGen/R600/llvm.rint.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rint_f32: -; R600: RNDNE - -; SI: v_rndne_f32_e32 -define void @rint_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.rint.f32(float %in) #0 - store float %0, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v2f32: -; R600: RNDNE -; R600: RNDNE - -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}rint_v4f32: -; R600: 
RNDNE -; R600: RNDNE -; R600: RNDNE -; R600: RNDNE - -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -; SI: v_rndne_f32_e32 -define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32: -; R600: RNDNE - -; SI: v_rndne_f32_e32 -define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0 - store float %0, float addrspace(1)* %out - ret void -} - -declare float @llvm.AMDIL.round.nearest.f32(float) #0 -declare float @llvm.rint.f32(float) #0 -declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0 -declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0 - -attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.round.f64.ll b/test/CodeGen/R600/llvm.round.f64.ll deleted file mode 100644 index 3d0f57e3328..00000000000 --- a/test/CodeGen/R600/llvm.round.f64.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}round_f64: -; SI: s_endpgm -define void @round_f64(double addrspace(1)* %out, double %x) #0 { - %result = call double @llvm.round.f64(double %x) #1 - store double %result, double addrspace(1)* %out - ret void -} - -; This is a pretty large function, so just test a few of the -; instructions that are necessary. - -; FUNC-LABEL: {{^}}v_round_f64: -; SI: buffer_load_dwordx2 -; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 - -; SI-DAG: v_not_b32_e32 -; SI-DAG: v_not_b32_e32 - -; SI-DAG: v_cmp_eq_i32 - -; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff -; SI-DAG: v_cmp_gt_i32_e64 -; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] - -; SI-DAG: v_cmp_gt_i32_e64 - - -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() #1 - %gep = getelementptr double, double addrspace(1)* %in, i32 %tid - %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid - %x = load double, double addrspace(1)* %gep - %result = call double @llvm.round.f64(double %x) #1 - store double %result, double addrspace(1)* %out.gep - ret void -} - -; FUNC-LABEL: {{^}}round_v2f64: -; SI: s_endpgm -define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { - %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 - store <2 x double> %result, <2 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v4f64: -; SI: s_endpgm -define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { - %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 - store <4 x double> %result, <4 x double> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v8f64: -; SI: s_endpgm -define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { - %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 - store <8 x double> %result, <8 x double> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 - -declare double @llvm.round.f64(double) #1 -declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 -declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 -declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind 
readnone } diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll deleted file mode 100644 index f5f124d915a..00000000000 --- a/test/CodeGen/R600/llvm.round.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}round_f32: -; SI-DAG: s_load_dword [[SX:s[0-9]+]] -; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff -; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] -; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] -; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] -; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] -; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| -; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] -; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] -; SI: buffer_store_dword [[RESULT]] - -; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] -; R600-DAG: ADD {{.*}}, -; R600-DAG: BFI_INT -; R600-DAG: SETGE -; R600-DAG: CNDE -; R600-DAG: ADD -define void @round_f32(float addrspace(1)* %out, float %x) #0 { - %result = call float @llvm.round.f32(float %x) #1 - store float %result, float addrspace(1)* %out - ret void -} - -; The vector tests are really difficult to verify, since it can be hard to -; predict how the scheduler will order the instructions. We already have -; a test for the scalar case, so the vector tests just check that the -; compiler doesn't crash. - -; FUNC-LABEL: {{^}}round_v2f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { - %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v4f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { - %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}round_v8f32: -; SI: s_endpgm -; R600: CF_END -define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { - %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 - store <8 x float> %result, <8 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.round.f32(float) #1 -declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 -declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 -declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll deleted file mode 100644 index 3bb245c2e24..00000000000 --- a/test/CodeGen/R600/llvm.sin.ll +++ /dev/null @@ -1,92 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI 
-check-prefix=SI-UNSAFE -check-prefix=FUNC %s - -; FUNC-LABEL: sin_f32 -; EG: MULADD_IEEE * -; EG: FRACT * -; EG: ADD * -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG-NOT: SIN -; SI: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 - -define void @sin_f32(float addrspace(1)* %out, float %x) #1 { - %sin = call float @llvm.sin.f32(float %x) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_3x_f32: -; SI-UNSAFE-NOT: v_add_f32 -; SI-UNSAFE: 0x3ef47644 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_mul_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 3.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_2x_f32: -; SI-UNSAFE-NOT: v_add_f32 -; SI-UNSAFE: 0x3ea2f983 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_add_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 2.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_2sin_f32: -; SI-UNSAFE: 0x3ea2f983 -; SI-UNSAFE: v_mul_f32 -; SI-SAFE: v_add_f32 -; SI-SAFE: v_mul_f32 -; SI: v_fract_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 -define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 { - %y = fmul float 2.0, %x - %sin = call float @llvm.sin.f32(float %y) - store float %sin, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sin_v4f32: -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}} -; EG-NOT: SIN -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI: v_sin_f32 -; SI-NOT: v_sin_f32 - -define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { - %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) - store <4 x float> %sin, <4 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.sin.f32(float) readnone -declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll deleted file mode 100644 index c6da047f539..00000000000 --- a/test/CodeGen/R600/llvm.sqrt.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn --mcpu=tonga -verify-machineinstrs| FileCheck %s --check-prefix=SI - -; R600-LABEL: {{^}}sqrt_f32: -; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z -; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS -; SI-LABEL: {{^}}sqrt_f32: -; SI: v_sqrt_f32_e32 -define void @sqrt_f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.sqrt.f32(float %in) - store float %0, float addrspace(1)* %out - ret void -} - -; R600-LABEL: {{^}}sqrt_v2f32: -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS -; SI-LABEL: {{^}}sqrt_v2f32: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> 
%in) { -entry: - %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; R600-LABEL: {{^}}sqrt_v4f32: -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS -; SI-LABEL: {{^}}sqrt_v4f32: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check: -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check(float addrspace(1)* %out, float %in) { -entry: - %sqrt = call float @llvm.sqrt.f32(float %in) - %cmp = fcmp olt float %in, -0.000000e+00 - %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt - store float %res, float addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_ult: -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) { -entry: - %sqrt = call float @llvm.sqrt.f32(float %in) - %cmp = fcmp ult float %in, -0.000000e+00 - %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt - store float %res, float addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_v2: -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - %cmp = fcmp olt <2 x float> %in, - %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt - store <2 x float> %res, <2 x float> addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}elim_redun_check_v2_ult -; SI: v_sqrt_f32_e32 -; SI: v_sqrt_f32_e32 -; SI-NOT: v_cndmask -define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) - %cmp = fcmp ult <2 x float> %in, - %res = select <2 x i1> %cmp, <2 x float> , <2 x float> %sqrt - store <2 x float> %res, <2 x float> addrspace(1)* %out - ret void -} - -declare float @llvm.sqrt.f32(float %in) -declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) -declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll deleted file mode 100644 index 0ca49fde3e7..00000000000 --- a/test/CodeGen/R600/load-i1.ll +++ /dev/null @@ -1,149 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}global_copy_i1_to_i1: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: buffer_store_byte -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: AND_INT -define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* 
%in) nounwind { - %load = load i1, i1 addrspace(1)* %in - store i1 %load, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}local_copy_i1_to_i1: -; SI: ds_read_u8 -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: ds_write_b8 -; SI: s_endpgm - -; EG: LDS_UBYTE_READ_RET -; EG: AND_INT -; EG: LDS_BYTE_WRITE -define void @local_copy_i1_to_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) nounwind { - %load = load i1, i1 addrspace(3)* %in - store i1 %load, i1 addrspace(3)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}constant_copy_i1_to_i1: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 v{{[0-9]+}}, 1 -; SI: buffer_store_byte -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: AND_INT -define void @constant_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) nounwind { - %load = load i1, i1 addrspace(2)* %in - store i1 %load, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}global_sextload_i1_to_i32: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: buffer_store_dword -; SI: s_endpgm - -; EG: VTX_READ_8 -; EG: BFE_INT -define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm - -define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_sextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i64 - store i64 %ext, i64 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}global_zextload_i1_to_i64: -; SI: buffer_load_ubyte -; SI: v_mov_b32_e32 {{v[0-9]+}}, 0 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i64 - store i64 %ext, i64 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg: -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 -; SI: buffer_store_byte -; SI: s_endpgm -define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { - store i1 %x, i1 addrspace(1)* %out, align 1 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_zext_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { - %ext = zext i1 %x to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_zext_i64: -; SI: buffer_load_ubyte -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { - %ext = zext i1 %x to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_sext_i32: -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm -define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { - %ext = sext i1 %x to i32 - store i32 %ext, i32addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}i1_arg_sext_i64: -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: v_ashrrev_i32 -; SI: 
buffer_store_dwordx2 -; SI: s_endpgm -define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { - %ext = sext i1 %x to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll deleted file mode 100644 index 1daf0e6527b..00000000000 --- a/test/CodeGen/R600/load-input-fold.ll +++ /dev/null @@ -1,117 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = load <4 x float>, <4 x float> addrspace(8)* null - %13 = extractelement <4 x float> %12, i32 0 - %14 = fmul float %0, %13 - %15 = load <4 x float>, <4 x float> addrspace(8)* null - %16 = extractelement <4 x float> %15, i32 1 - %17 = fmul float %0, %16 - %18 = load <4 x float>, <4 x float> addrspace(8)* null - %19 = extractelement <4 x float> %18, i32 2 - %20 = fmul float %0, %19 - %21 = load <4 x float>, <4 x float> addrspace(8)* null - %22 = extractelement <4 x float> %21, i32 3 - %23 = fmul float %0, %22 - %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %25 = extractelement <4 x float> %24, i32 0 - %26 = fmul float %1, %25 - %27 = fadd float %26, %14 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %29 = extractelement <4 x float> %28, i32 1 - %30 = fmul float %1, %29 - %31 = fadd float %30, %17 - %32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %33 = extractelement <4 x float> %32, i32 2 - %34 = fmul float %1, %33 - %35 = fadd float %34, %20 - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %37 = extractelement <4 x float> %36, i32 3 - %38 = fmul float %1, %37 - %39 = fadd float %38, %23 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %2, %41 - %43 = fadd float %42, %27 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %2, %45 - %47 = fadd float %46, %31 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %2, %49 - %51 = fadd float %50, %35 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %53 = 
extractelement <4 x float> %52, i32 3 - %54 = fmul float %2, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %3, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %3, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %3, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %3, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %73 = extractelement <4 x float> %72, i32 0 - %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %75 = extractelement <4 x float> %74, i32 1 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %77 = extractelement <4 x float> %76, i32 2 - %78 = insertelement <4 x float> undef, float %4, i32 0 - %79 = insertelement <4 x float> %78, float %5, i32 1 - %80 = insertelement <4 x float> %79, float %6, i32 2 - %81 = insertelement <4 x float> %80, float 0.000000e+00, i32 3 - %82 = insertelement <4 x float> undef, float %73, i32 0 - %83 = insertelement <4 x float> %82, float %75, i32 1 - %84 = insertelement <4 x float> %83, float %77, i32 2 - %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3 - %86 = call float @llvm.AMDGPU.dp4(<4 x float> %81, <4 x float> %85) - %87 = insertelement <4 x float> undef, float %86, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %87, i32 2, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll deleted file mode 100644 index 93b1b51a0d0..00000000000 --- a/test/CodeGen/R600/load.ll +++ /dev/null @@ -1,709 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - 
-;===------------------------------------------------------------------------===; -; GLOBAL ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load an i8 value from the global address space. -; FUNC-LABEL: {{^}}load_i8: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} - -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { - %1 = load i8, i8 addrspace(1)* %in - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_sext: -; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 8 -; SI: buffer_load_sbyte -define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = load i8, i8 addrspace(1)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8: -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in - %1 = zext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_sext: -; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: 8 -; R600-DAG: 8 - -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(1)* %in - %1 = sext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8: -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; R600: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in - %1 = zext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_sext: -; R600-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] -; R600-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal -; R600-DAG: 8 -; R600-DAG: 8 -; R600-DAG: 8 -; R600-DAG: 8 -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -; SI: buffer_load_sbyte -define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(1)* %in - %1 = sext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; Load an i16 value from the global address space. 
-; FUNC-LABEL: {{^}}load_i16: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { -entry: - %0 = load i16 , i16 addrspace(1)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext: -; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 16 -; SI: buffer_load_sshort -define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { -entry: - %0 = load i16, i16 addrspace(1)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16: -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in - %1 = zext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_sext: -; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: 16 -; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(1)* %in - %1 = sext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16: -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; R600: VTX_READ_16 -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -; SI: buffer_load_ushort -define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in - %1 = zext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_sext: -; R600-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] -; R600-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] -; R600-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]] -; R600-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]] -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_X]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Z]], 0.0, literal -; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_W]], 0.0, literal -; R600-DAG: 16 -; R600-DAG: 16 -; R600-DAG: 16 -; R600-DAG: 16 -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -; SI: buffer_load_sshort -define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(1)* %in - %1 = sext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; load an i32 value from the global address space. 
-; FUNC-LABEL: {{^}}load_i32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: buffer_load_dword v{{[0-9]+}} -define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; load a f32 value from the global address space. -; FUNC-LABEL: {{^}}load_f32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: buffer_load_dword v{{[0-9]+}} -define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) { -entry: - %0 = load float, float addrspace(1)* %in - store float %0, float addrspace(1)* %out - ret void -} - -; load a v2f32 value from the global address space -; FUNC-LABEL: {{^}}load_v2f32: -; R600: MEM_RAT -; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 -define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(1)* %in - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64: -; R600: VTX_READ_64 -; SI: buffer_load_dwordx2 -define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { -entry: - %0 = load i64, i64 addrspace(1)* %in - store i64 %0, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64_sext: -; R600: MEM_RAT -; R600: MEM_RAT -; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x -; R600: 31 -; SI: buffer_load_dword - -define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = sext i32 %0 to i64 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i64_zext: -; R600: MEM_RAT -; R600: MEM_RAT -define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = zext i32 %0 to i64 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v8i32: -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { -entry: - %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in - store <8 x i32> %0, <8 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v16i32: -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. 
-; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { -entry: - %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in - store <16 x i32> %0, <16 x i32> addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; CONSTANT ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load a sign-extended i8 value -; FUNC-LABEL: {{^}}load_const_i8_sext: -; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 8 -; SI: buffer_load_sbyte v{{[0-9]+}}, -define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = load i8, i8 addrspace(2)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an aligned i8 value -; FUNC-LABEL: {{^}}load_const_i8_aligned: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = load i8, i8 addrspace(2)* %in - %1 = zext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an un-aligned i8 value -; FUNC-LABEL: {{^}}load_const_i8_unaligned: -; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ubyte v{{[0-9]+}}, -define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { -entry: - %0 = getelementptr i8, i8 addrspace(2)* %in, i32 1 - %1 = load i8, i8 addrspace(2)* %0 - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; Load a sign-extended i16 value -; FUNC-LABEL: {{^}}load_const_i16_sext: -; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]] -; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal -; R600: 16 -; SI: buffer_load_sshort -define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = load i16, i16 addrspace(2)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an aligned i16 value -; FUNC-LABEL: {{^}}load_const_i16_aligned: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = load i16, i16 addrspace(2)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Load an un-aligned i16 value -; FUNC-LABEL: {{^}}load_const_i16_unaligned: -; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} -; SI: buffer_load_ushort -define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) { -entry: - %0 = getelementptr i16, i16 addrspace(2)* %in, i32 1 - %1 = load i16, i16 addrspace(2)* %0 - %2 = zext i16 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; Load an i32 value from the constant address space. 
-; FUNC-LABEL: {{^}}load_const_addrspace_i32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: s_load_dword s{{[0-9]+}} -define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { -entry: - %0 = load i32, i32 addrspace(2)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Load a f32 value from the constant address space. -; FUNC-LABEL: {{^}}load_const_addrspace_f32: -; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 - -; SI: s_load_dword s{{[0-9]+}} -define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) { - %1 = load float, float addrspace(2)* %in - store float %1, float addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; LOCAL ADDRESS SPACE -;===------------------------------------------------------------------------===; - -; Load an i8 value from the local address space. -; FUNC-LABEL: {{^}}load_i8_local: -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { - %1 = load i8, i8 addrspace(3)* %in - %2 = zext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i8_sext_local: -; R600: LDS_UBYTE_READ_RET -; R600: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { -entry: - %0 = load i8, i8 addrspace(3)* %in - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_local: -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in - %1 = zext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i8_sext_local: -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { -entry: - %0 = load <2 x i8>, <2 x i8> addrspace(3)* %in - %1 = sext <2 x i8> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_local: -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; R600: LDS_UBYTE_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in - %1 = zext <4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i8_sext_local: -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: LDS_UBYTE_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -; SI: ds_read_i8 -define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) { -entry: - %0 = load <4 x i8>, <4 x i8> addrspace(3)* %in - %1 = sext 
<4 x i8> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; Load an i16 value from the local address space. -; FUNC-LABEL: {{^}}load_i16_local: -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { -entry: - %0 = load i16 , i16 addrspace(3)* %in - %1 = zext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_i16_sext_local: -; R600: LDS_USHORT_READ_RET -; R600: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { -entry: - %0 = load i16, i16 addrspace(3)* %in - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_local: -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in - %1 = zext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v2i16_sext_local: -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { -entry: - %0 = load <2 x i16>, <2 x i16> addrspace(3)* %in - %1 = sext <2 x i16> %0 to <2 x i32> - store <2 x i32> %1, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_local: -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; R600: LDS_USHORT_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in - %1 = zext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}load_v4i16_sext_local: -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: LDS_USHORT_READ_RET -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; R600-DAG: BFE_INT -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -; SI: ds_read_i16 -define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) { -entry: - %0 = load <4 x i16>, <4 x i16> addrspace(3)* %in - %1 = sext <4 x i16> %0 to <4 x i32> - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; load an i32 value from the local address space. -; FUNC-LABEL: {{^}}load_i32_local: -; R600: LDS_READ_RET -; SI-NOT: s_wqm_b64 -; SI: s_mov_b32 m0 -; SI: ds_read_b32 -define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %0 = load i32, i32 addrspace(3)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; load a f32 value from the local address space. 
-; FUNC-LABEL: {{^}}load_f32_local: -; R600: LDS_READ_RET -; SI: s_mov_b32 m0 -; SI: ds_read_b32 -define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) { -entry: - %0 = load float, float addrspace(3)* %in - store float %0, float addrspace(1)* %out - ret void -} - -; load a v2f32 value from the local address space -; FUNC-LABEL: {{^}}load_v2f32_local: -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; SI: s_mov_b32 m0 -; SI: ds_read_b64 -define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(3)* %in - store <2 x float> %0, <2 x float> addrspace(1)* %out - ret void -} - -; Test loading a i32 and v2i32 value from the same base pointer. -; FUNC-LABEL: {{^}}load_i32_v2i32_local: -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; R600: LDS_READ_RET -; SI-DAG: ds_read_b32 -; SI-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) { - %scalar = load i32, i32 addrspace(3)* %in - %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* - %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 - %vec0 = load <2 x i32>, <2 x i32> addrspace(3)* %vec_ptr, align 4 - %vec1 = insertelement <2 x i32> , i32 %scalar, i32 0 - %vec = add <2 x i32> %vec0, %vec1 - store <2 x i32> %vec, <2 x i32> addrspace(1)* %out - ret void -} - - -@lds = addrspace(3) global [512 x i32] undef, align 4 - -; On SI we need to make sure that the base offset is a register and not -; an immediate. -; FUNC-LABEL: {{^}}load_i32_local_const_ptr: -; SI: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 -; SI: ds_read_b32 v0, v[[ZERO]] offset:4 -; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { -entry: - %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 - %tmp1 = load i32, i32 addrspace(3)* %tmp0 - %tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp1, i32 addrspace(1)* %tmp2 - ret void -} diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll deleted file mode 100644 index 02f883cd8e9..00000000000 --- a/test/CodeGen/R600/load.vec.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -; load a v2i32 value from the global address space. -; EG: {{^}}load_v2i32: -; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0 -; SI: {{^}}load_v2i32: -; SI: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - store <2 x i32> %a, <2 x i32> addrspace(1)* %out - ret void -} - -; load a v4i32 value from the global address space. 
-; EG: {{^}}load_v4i32: -; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0 -; SI: {{^}}load_v4i32: -; SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}] -define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - store <4 x i32> %a, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll deleted file mode 100644 index 74beabdc007..00000000000 --- a/test/CodeGen/R600/load64.ll +++ /dev/null @@ -1,31 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; load a f64 value from the global address space. -; CHECK-LABEL: {{^}}load_f64: -; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) { - %1 = load double, double addrspace(1)* %in - store double %1, double addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}load_i64: -; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tmp = load i64, i64 addrspace(1)* %in - store i64 %tmp, i64 addrspace(1)* %out, align 8 - ret void -} - -; Load a f64 value from the constant address space. -; CHECK-LABEL: {{^}}load_const_addrspace_f64: -; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] -; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}] -define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) { - %1 = load double, double addrspace(2)* %in - store double %1, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll deleted file mode 100644 index 33f3159d13e..00000000000 --- a/test/CodeGen/R600/local-64.ll +++ /dev/null @@ -1,167 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}local_i32_load -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 -; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 - %val = load i32, i32 addrspace(3)* %gep, align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i32_load_0_offset -; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} -; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { - %val = load i32, i32 addrspace(3)* %in, align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset: -; BOTH-NOT: ADD -; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 -; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { - %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 - %val = load i8, i8 addrspace(3)* %gep, align 4 - store i8 %val, i8 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: 
{{^}}local_i8_load_over_i16_max_offset: -; The LDS offset will be 65536 bytes, which is larger than the size of LDS on -; SI, which is why it is being OR'd with the base pointer. -; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000 -; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] -; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] -; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { - %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 - %val = load i8, i8 addrspace(3)* %gep, align 4 - store i8 %val, i8 addrspace(1)* %out, align 4 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 - %val = load i64, i64 addrspace(3)* %gep, align 8 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { - %val = load i64, i64 addrspace(3)* %in, align 8 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_load: -; BOTH-NOT: ADD -; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { - %gep = getelementptr double, double addrspace(3)* %in, i32 7 - %val = load double, double addrspace(3)* %gep, align 8 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_load_0_offset -; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { - %val = load double, double addrspace(3)* %in, align 8 - store double %val, double addrspace(1)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_i64_store(i64 addrspace(3)* %out) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 - store i64 5678, i64 addrspace(3)* %gep, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { - store i64 1234, i64 addrspace(3)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_store: -; BOTH-NOT: ADD -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_f64_store(double addrspace(3)* %out) nounwind { - %gep = getelementptr double, double addrspace(3)* %out, i32 7 - store double 16.0, double addrspace(3)* %gep, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_f64_store_0_offset -; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { - store double 20.0, double addrspace(3)* %out, align 8 - ret void -} - -; BOTH-LABEL: {{^}}local_v2i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, 
{{v\[[0-9]+:[0-9]+\]}} offset:112 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 -; BOTH: s_endpgm -define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { - %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 - store <2 x i64> , <2 x i64> addrspace(3)* %gep, align 16 - ret void -} - -; BOTH-LABEL: {{^}}local_v2i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 -; BOTH: s_endpgm -define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { - store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16 - ret void -} - -; BOTH-LABEL: {{^}}local_v4i64_store: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 -; BOTH: s_endpgm -define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { - %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 - store <4 x i64> , <4 x i64> addrspace(3)* %gep, align 16 - ret void -} - -; BOTH-LABEL: {{^}}local_v4i64_store_0_offset: -; BOTH-NOT: ADD -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 -; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 -; BOTH: s_endpgm -define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { - store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll deleted file mode 100644 index 2aaf977ab90..00000000000 --- a/test/CodeGen/R600/local-atomics.ll +++ /dev/null @@ -1,551 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32: -; EG: LDS_WRXCHG_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset: -; EG: LDS_WRXCHG_RET * -; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - 
ret void -} - -; XXX - Is it really necessary to load 4 into VGPR? -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: buffer_store_dword [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset: -; EG: LDS_ADD_RET * -; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset: -; EG: LDS_ADD_RET * -; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: -; EG: LDS_ADD_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset: -; EG: LDS_ADD_RET * -; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32: -; EG: LDS_SUB_RET * -; GCN: ds_sub_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}lds_atomic_sub_ret_i32_offset: -; EG: LDS_SUB_RET * -; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32: -; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: -; EG: LDS_SUB_RET * -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32: -; EG: LDS_AND_RET * -; GCN: ds_and_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset: -; EG: LDS_AND_RET * -; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32: -; EG: LDS_OR_RET * -; GCN: ds_or_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset: -; EG: LDS_OR_RET * -; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32: -; EG: LDS_XOR_RET * -; GCN: ds_xor_rtn_b32 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset: -; EG: LDS_XOR_RET * -; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 
addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst -; store i32 %result, i32 addrspace(1)* %out, align 4 -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32: -; EG: LDS_MIN_INT_RET * -; GCN: ds_min_rtn_i32 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset: -; EG: LDS_MIN_INT_RET * -; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32: -; EG: LDS_MAX_INT_RET * -; GCN: ds_max_rtn_i32 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset: -; EG: LDS_MAX_INT_RET * -; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32: -; EG: LDS_MIN_UINT_RET * -; GCN: ds_min_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset: -; EG: LDS_MIN_UINT_RET * -; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32: -; EG: LDS_MAX_UINT_RET * -; GCN: ds_max_rtn_u32 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset: -; EG: LDS_MAX_UINT_RET * -; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { - %gep = 
getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: -; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; XXX - Is it really necessary to load 4 into VGPR? -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32: -; GCN: s_load_dword [[SPTR:s[0-9]+]], -; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] -; GCN: ds_add_u32 [[VPTR]], [[DATA]] -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: -; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset -; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset: -; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} -; CIVI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { - %sub = sub i32 %a, %b - %add = add i32 %sub, 4 - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add - %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: -; GCN: ds_sub_u32 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw 
sub i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset: -; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: -; GCN: v_mov_b32_e32 [[NEGONE:v[0-9]+]], -1 -; GCN: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16 -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32: -; GCN: ds_and_b32 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset: -; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32: -; GCN: ds_or_b32 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset: -; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32: -; GCN: ds_xor_b32 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset: -; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: -; GCN: ds_min_i32 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: -; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: -; GCN: ds_max_i32 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: -; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: -; GCN: ds_min_u32 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: -; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: -; GCN: ds_max_u32 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: -; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst - ret void -} diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll deleted file mode 100644 index 0ffa5e751b7..00000000000 --- a/test/CodeGen/R600/local-atomics64.ll +++ /dev/null @@ -1,470 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64: -; GCN: ds_wrxchg_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: -; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 
addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64: -; GCN: ds_add_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset: -; GCN: ds_inc_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64: -; GCN: ds_sub_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: -; GCN: ds_sub_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: buffer_store_dwordx2 [[RESULT]], -; GCN: s_endpgm -define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset: -; GCN: ds_dec_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void 
@lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64: -; GCN: ds_and_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset: -; GCN: ds_and_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64: -; GCN: ds_or_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset: -; GCN: ds_or_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64: -; GCN: ds_xor_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset: -; GCN: ds_xor_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64: -; GCN: ds_min_rtn_i64 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset: -; GCN: ds_min_rtn_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64: -; GCN: ds_max_rtn_i64 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset: -; GCN: ds_max_rtn_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64: -; GCN: ds_min_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: -; GCN: ds_min_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64: -; GCN: ds_max_rtn_u64 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: -; GCN: ds_max_rtn_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64: -; GCN: ds_wrxchg_rtn_b64 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: -; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - 
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64: -; GCN: ds_add_u64 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset: -; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0 -; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] -; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset: -; GCN: ds_inc_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64: -; GCN: ds_sub_u64 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: -; GCN: ds_sub_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64: -; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1 -; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1 -; GCN: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset: -; GCN: ds_dec_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64: -; GCN: ds_and_b64 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset: -; GCN: ds_and_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: 
{{^}}lds_atomic_or_noret_i64: -; GCN: ds_or_b64 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset: -; GCN: ds_or_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64: -; GCN: ds_xor_b64 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset: -; GCN: ds_xor_b64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FIXME: There is no atomic nand instr -; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this. -; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { -; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst -; ret void -; } - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64: -; GCN: ds_min_i64 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset: -; GCN: ds_min_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64: -; GCN: ds_max_i64 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset: -; GCN: ds_max_i64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64: -; GCN: ds_min_u64 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset: -; GCN: ds_min_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 - %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64: -; GCN: ds_max_u64 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { - %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst - ret void -} - -; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset: -; GCN: ds_max_u64 {{.*}} offset:32 -; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { - %gep = getelementptr i64, i64 addrspace(3)* 
%ptr, i32 4 - %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst - ret void -} diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll deleted file mode 100644 index 06a8b1246e6..00000000000 --- a/test/CodeGen/R600/local-memory-two-objects.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=SI %s -; RUN: llc < %s -march=amdgcn -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=CI %s - -@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 -@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4 - - -; Check that the LDS size emitted correctly -; EG: .long 166120 -; EG-NEXT: .long 8 -; GCN: .long 47180 -; GCN-NEXT: .long 38792 - -; EG: {{^}}local_memory_two_objects: - -; We would like to check the the lds writes are using different -; addresses, but due to variations in the scheduler, we can't do -; this consistently on evergreen GPUs. -; EG: LDS_WRITE -; EG: LDS_WRITE -; GCN: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]] -; GCN-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]] - -; GROUP_BARRIER must be the last instruction in a clause -; EG: GROUP_BARRIER -; EG-NEXT: ALU clause - -; Make sure the lds reads are using different addresses, at different -; constant offsets. -; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] -; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] -; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 -; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] - -define void @local_memory_two_objects(i32 addrspace(1)* %out) { -entry: - %x.i = call i32 @llvm.r600.read.tidig.x() #0 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i - store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 - %mul = shl nsw i32 %x.i, 1 - %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i - store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 - %sub = sub nsw i32 3, %x.i - call void @llvm.AMDGPU.barrier.local() - %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub - %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i - store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 - %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub - %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 - %add = add nsw i32 %x.i, 4 - %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add - store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll deleted file mode 100644 index 9494ed75bd0..00000000000 --- a/test/CodeGen/R600/local-memory.ll +++ /dev/null @@ -1,49 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc 
-march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s - -@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 - - -; Check that the LDS size emitted correctly -; EG: .long 166120 -; EG-NEXT: .long 128 -; SI: .long 47180 -; SI-NEXT: .long 71560 -; CI: .long 47180 -; CI-NEXT: .long 38792 - -; FUNC-LABEL: {{^}}local_memory: - -; EG: LDS_WRITE -; SI-NOT: s_wqm_b64 -; SI: ds_write_b32 - -; GROUP_BARRIER must be the last instruction in a clause -; EG: GROUP_BARRIER -; EG-NEXT: ALU clause -; SI: s_barrier - -; EG: LDS_READ_RET -; SI: ds_read_b32 {{v[0-9]+}}, - -define void @local_memory(i32 addrspace(1)* %out) { -entry: - %y.i = call i32 @llvm.r600.read.tidig.x() #0 - %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i - store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4 - %add = add nsw i32 %y.i, 1 - %cmp = icmp eq i32 %add, 16 - %.add = select i1 %cmp, i32 0, i32 %add - call void @llvm.AMDGPU.barrier.local() - %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add - %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i - store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 -declare void @llvm.AMDGPU.barrier.local() - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll deleted file mode 100644 index f60d574497d..00000000000 --- a/test/CodeGen/R600/loop-address.ll +++ /dev/null @@ -1,34 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood < %s | FileCheck %s - -;CHECK: ALU_PUSH -;CHECK: LOOP_START_DX10 @11 -;CHECK: LOOP_BREAK @10 -;CHECK: POP @10 - -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { -entry: - %cmp5 = icmp sgt i32 %iterations, 0 - br i1 %cmp5, label %for.body, label %for.end - -for.body: ; preds = %for.body, %entry - %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] - %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] - %i.07 = add nsw i32 %i.07.in, -1 - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %ai.06 - store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 - %add = add nsw i32 %ai.06, 1 - %exitcond = icmp eq i32 %add, %iterations - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -attributes #0 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } - -!opencl.kernels = !{!0, !1, !2, !3} - -!0 = !{void (i32 addrspace(1)*, i32)* @loop_ge} -!1 = !{null} -!2 = !{null} -!3 = !{null} diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll deleted file mode 100644 index 5fd9806813c..00000000000 --- a/test/CodeGen/R600/loop-idiom.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: opt -basicaa -loop-idiom -S < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s - - -; Make sure loop-idiom 
doesn't create memcpy or memset. There are no library -; implementations of these for R600. - -; FUNC: @no_memcpy -; R600-NOT: {{^}}llvm.memcpy -; SI-NOT: {{^}}llvm.memcpy -define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { -entry: - %dest = alloca i8, i32 32 - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%4, %for.body] - %1 = getelementptr i8, i8 addrspace(3)* %in, i32 %0 - %2 = getelementptr i8, i8* %dest, i32 %0 - %3 = load i8, i8 addrspace(3)* %1 - store i8 %3, i8* %2 - %4 = add i32 %0, 1 - %5 = icmp eq i32 %4, %size - br i1 %5, label %for.end, label %for.body - -for.end: - ret void -} - -; FUNC: @no_memset -; R600-NOT: {{^}}llvm.memset -; R600-NOT: {{^}}memset_pattern16: -; SI-NOT: {{^}}llvm.memset -; SI-NOT: {{^}}memset_pattern16: -define void @no_memset(i32 %size) { -entry: - %dest = alloca i8, i32 32 - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%2, %for.body] - %1 = getelementptr i8, i8* %dest, i32 %0 - store i8 0, i8* %1 - %2 = add i32 %0, 1 - %3 = icmp eq i32 %2, %size - br i1 %3, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll deleted file mode 100644 index 9ac988d38d1..00000000000 --- a/test/CodeGen/R600/lshl.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = mul i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll deleted file mode 100644 index 50e444ac26b..00000000000 --- a/test/CodeGen/R600/lshr.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 - -define void @test(i32 %p) { - %i = udiv i32 %p, 2 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/m0-spill.ll b/test/CodeGen/R600/m0-spill.ll deleted file mode 100644 index 1dddc85f775..00000000000 --- a/test/CodeGen/R600/m0-spill.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -@lds = external addrspace(3) global [64 x float] - -; CHECK-LABEL: {{^}}main: -; CHECK-NOT: v_readlane_b32 m0 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { -main_body: - %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - %cmp = fcmp ueq float 0.0, %4 - br i1 %cmp, label %if, label %else - -if: - %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0 - %lds_data = load float, float addrspace(3)* %lds_ptr - br 
label %endif - -else: - %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) - br label %endif - -endif: - %export = phi float [%lds_data, %if], [%interp, %else] - %5 = call i32 @llvm.SI.packf16(float %export, float %export) - %6 = bitcast i32 %5 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6) - ret void -} - -declare float @llvm.SI.fs.constant(i32, i32, i32) readnone - -declare i32 @llvm.SI.packf16(float, float) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/mad-combine.ll b/test/CodeGen/R600/mad-combine.ll deleted file mode 100644 index bc071628ead..00000000000 --- a/test/CodeGen/R600/mad-combine.ll +++ /dev/null @@ -1,567 +0,0 @@ -; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. - -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s - -; Make sure we don't form mad with denormals -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare float @llvm.fabs.f32(float) #0 -declare float @llvm.fma.f32(float, float, float) #0 -declare float @llvm.fmuladd.f32(float, float, float) #0 - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_mad_f32_0: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF-NOT: v_fma -; SI-DENORM-SLOWFMAF-NOT: v_mad - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fadd float %mul, %c - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fadd (fmul x, y), z) -> (fma x, y, z) -; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use: -; SI-DAG: 
buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fadd float %mul, %c - %fma1 = fadd float %mul, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fadd x, (fmul y, z)) -> (fma y, z, x) -; FUNC-LABEL: {{^}}combine_to_mad_f32_1: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fadd float %c, %mul - store float %fma, float addrspace(1)* %gep.out - ret 
void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fsub float %mul, %c - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %fma0 = fsub 
float %mul, %c - %fma1 = fsub float %mul, %d - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %fma = fsub float %c, %mul - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - 
%mul = fmul float %a, %b - %fma0 = fsub float %c, %mul - %fma1 = fsub float %d, %mul - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] - -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]] - -; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma = fsub float %mul.neg, %c - - store float %fma, float addrspace(1)* %gep.out - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, 
float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul.neg, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z)) -; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} - -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]] -; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]] -; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] - -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - %d = load float, float addrspace(1)* %gep.3 - - %mul = fmul float %a, %b - %mul.neg = fsub float -0.0, %mul - %fma0 = fsub float %mul.neg, %c - %fma1 = fsub float %mul, %d - - store float %fma0, float addrspace(1)* %gep.out.0 - store float %fma1, float addrspace(1)* %gep.out.1 - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]] - -; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]] -; 
SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fma y, z, (fmul u, v))) -; -> (fma (fneg y), z, (fma (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] - -; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float 
addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] - -; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0 - %tmp2 = fsub float %tmp1, %z - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -; fold (fsub x, (fmuladd y, z, (fmul u, v))) -; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x)) - -; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32: -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} - -; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] - -; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] -; SI-DENORM: v_fma_f32 
[[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] - -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] -; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] -; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] - -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3 - %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %x = load float, float addrspace(1)* %gep.0 - %y = load float, float addrspace(1)* %gep.1 - %z = load float, float addrspace(1)* %gep.2 - %u = load float, float addrspace(1)* %gep.3 - %v = load float, float addrspace(1)* %gep.4 - - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 - - store float %tmp2, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/mad-sub.ll b/test/CodeGen/R600/mad-sub.ll deleted file mode 100644 index aa4194ff610..00000000000 --- a/test/CodeGen/R600/mad-sub.ll +++ /dev/null @@ -1,215 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() #0 -declare float @llvm.fabs.f32(float) #0 - -; FUNC-LABEL: {{^}}mad_sub_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_inv_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float 
addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %mul = fmul float %a, %b - %sub = fsub float %c, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_f64: -; SI: v_mul_f64 -; SI: v_add_f64 -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr double, double addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr double, double addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr double, double addrspace(1)* %out, i64 %tid.ext - %a = load double, double addrspace(1)* %gep0, align 8 - %b = load double, double addrspace(1)* %gep1, align 8 - %c = load double, double addrspace(1)* %gep2, align 8 - %mul = fmul double %a, %b - %sub = fsub double %mul, %c - store double %sub, double addrspace(1)* %outgep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_fabs_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %mul, %c.abs - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; SI: buffer_store_dword [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float 
addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %c.abs = call float @llvm.fabs.f32(float %c) #0 - %mul = fmul float %a, %b - %sub = fsub float %c.abs, %mul - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}neg_neg_mad_f32: -; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %nega = fsub float -0.000000e+00, %a - %negb = fsub float -0.000000e+00, %b - %mul = fmul float %nega, %negb - %sub = fadd float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}mad_fabs_sub_f32: -; SI: buffer_load_dword [[REGA:v[0-9]+]] -; SI: buffer_load_dword [[REGB:v[0-9]+]] -; SI: buffer_load_dword [[REGC:v[0-9]+]] -; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] -; SI: buffer_store_dword [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { - %tid = tail call i32 @llvm.r600.read.tidig.x() #0 - %tid.ext = sext i32 %tid to i64 - %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext - %add1 = add i64 %tid.ext, 1 - %gep1 = getelementptr float, float addrspace(1)* %ptr, i64 %add1 - %add2 = add i64 %tid.ext, 2 - %gep2 = getelementptr float, float addrspace(1)* %ptr, i64 %add2 - %outgep = getelementptr float, float addrspace(1)* %out, i64 %tid.ext - %a = load float, float addrspace(1)* %gep0, align 4 - %b = load float, float addrspace(1)* %gep1, align 4 - %c = load float, float addrspace(1)* %gep2, align 4 - %b.abs = call float @llvm.fabs.f32(float %b) #0 - %mul = fmul float %a, %b.abs - %sub = fsub float %mul, %c - store float %sub, float addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; SI: buffer_store_dword [[RESULT]] -define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %r2, %add - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}fsub_fadd_a_a_c: -; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]] -; SI: buffer_store_dword [[RESULT]] -define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid - - %r1 = load float, float addrspace(1)* %gep.0 - %r2 = load float, float addrspace(1)* %gep.1 - - %add = fadd float %r1, %r1 - %r3 = fsub float %add, %r2 - - store float %r3, float addrspace(1)* %gep.out - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll deleted file mode 100644 index 86d75a63ca4..00000000000 --- a/test/CodeGen/R600/mad_int24.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}i32_mad24: -; Signed 24-bit multiply is not supported on pre-Cayman GPUs. -; EG: MULLO_INT -; Make sure we aren't masking the inputs. -; CM-NOT: AND -; CM: MULADD_INT24 -; SI-NOT: and -; SI: v_mad_i32_i24 -define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = shl i32 %a, 8 - %a_24 = ashr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = ashr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - %3 = add i32 %2, %c - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: @test_imul24 -; SI: v_mad_i32_i24 -define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { - %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone - %add = add i32 %mul, %src2 - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll deleted file mode 100644 index 95fe3411959..00000000000 --- a/test/CodeGen/R600/mad_uint24.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}u32_mad24: -; EG: MULADD_UINT24 -; SI: v_mad_u32_u24 - -define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = shl i32 %a, 8 - %a_24 = lshr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = lshr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - %3 = add i32 %2, %c - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i16_mad24: -; The order of A and B does not matter. 
-; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG: 16 -; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 - -define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { -entry: - %0 = mul i16 %a, %b - %1 = add i16 %0, %c - %2 = sext i16 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i8_mad24: -; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG: 8 -; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - -define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { -entry: - %0 = mul i8 %a, %b - %1 = add i8 %0, %c - %2 = sext i8 %1 to i32 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; This tests for a bug where the mad_u24 pattern matcher would call -; SimplifyDemandedBits on the first operand of the mul instruction -; assuming that the pattern would be matched to a 24-bit mad. This -; led to some instructions being incorrectly erased when the entire -; 24-bit mad pattern wasn't being matched. - -; Check that the select instruction is not deleted. -; FUNC-LABEL: {{^}}i24_i32_i32_mad: -; EG: CNDE_INT -; SI: v_cndmask -define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { -entry: - %0 = ashr i32 %a, 8 - %1 = icmp ne i32 %c, 0 - %2 = select i1 %1, i32 %0, i32 34 - %3 = mul i32 %2, %c - %4 = add i32 %3, %d - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/madak.ll b/test/CodeGen/R600/madak.ll deleted file mode 100644 index 933bb016d2c..00000000000 --- a/test/CodeGen/R600/madak.ll +++ /dev/null @@ -1,193 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -; FIXME: Enable VI - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; GCN-LABEL: {{^}}madak_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 -define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure this is only folded with one use. This is a code size -; optimization and if we fold the immediate multiple times, we'll undo -; it. 
- -; GCN-LABEL: {{^}}madak_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] -; GCN: s_endpgm -define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in.gep.0, i32 2 - - %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 - - %mul0 = fmul float %a, %b - %mul1 = fmul float %a, %c - %madak0 = fadd float %mul0, 10.0 - %madak1 = fadd float %mul1, 10.0 - - store float %madak0, float addrspace(1)* %out.gep.0, align 4 - store float %madak1, float addrspace(1)* %out.gep.1, align 4 - ret void -} - -; GCN-LABEL: {{^}}madak_m_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - - %mul = fmul float 4.0, %a - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; Make sure nothing weird happens with a value that is also allowed as -; an inline immediate. 
- -; GCN-LABEL: {{^}}madak_inline_imm_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 4.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; We can't use an SGPR when forming madak -; GCN-LABEL: {{^}}s_v_madak_f32: -; GCN: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] -define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: @v_s_madak_f32 -; GCN-DAG: s_load_dword [[SB:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] -define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}s_s_madak_f32: -; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %mul = fmul float %a, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -; GCN: s_endpgm -define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 
- - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %mul = fmul float %a.fabs, %b - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: -; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} -; GCN: s_endpgm -define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid - %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %in.a.gep, align 4 - %b = load float, float addrspace(1)* %in.b.gep, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %mul = fmul float %a, %b.fabs - %madak = fadd float %mul, 10.0 - store float %madak, float addrspace(1)* %out.gep, align 4 - ret void -} diff --git a/test/CodeGen/R600/madmk.ll b/test/CodeGen/R600/madmk.ll deleted file mode 100644 index ba7bb221a99..00000000000 --- a/test/CodeGen/R600/madmk.ll +++ /dev/null @@ -1,205 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.fabs.f32(float) nounwind readnone - -; GCN-LABEL: {{^}}madmk_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_madmk_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}madmk_2_use_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] -; GCN: s_endpgm -define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - - %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - %in.gep.2 = getelementptr 
float, float addrspace(1)* %in.gep.0, i32 2 - - %out.gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid - %out.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 - - %a = load float, float addrspace(1)* %in.gep.0, align 4 - %b = load float, float addrspace(1)* %in.gep.1, align 4 - %c = load float, float addrspace(1)* %in.gep.2, align 4 - - %mul0 = fmul float %a, 10.0 - %mul1 = fmul float %a, 10.0 - %madmk0 = fadd float %mul0, %b - %madmk1 = fadd float %mul1, %c - - store float %madmk0, float addrspace(1)* %out.gep.0, align 4 - store float %madmk1, float addrspace(1)* %out.gep.1, align 4 - ret void -} - -; We don't get any benefit if the constant is an inline immediate. -; GCN-LABEL: {{^}}madmk_inline_imm_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] -define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %mul = fmul float %a, 4.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}s_s_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}v_s_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %a = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}scalar_vector_madmk_f32: -; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 -; GCN: s_endpgm -define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %b = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madmk_src0_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} -define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone - - %mul = fmul float %a.fabs, 10.0 - %madmk = fadd float %mul, %b - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}no_madmk_src2_modifier_f32: -; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| -define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - %b = load float, float addrspace(1)* %gep.1, align 4 - - %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, %b.fabs - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: -; GCN: buffer_load_dword [[A:v[0-9]+]] -; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 -define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - - %a = load float, float addrspace(1)* %gep.0, align 4 - - %mul = fmul float %a, 10.0 - %madmk = fadd float %mul, 2.0 - store float %madmk, float addrspace(1)* %out.gep, align 4 - ret void -} - -; SI-LABEL: {{^}}kill_madmk_verifier_error: -; SI: s_xor_b64 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x472aee8c -; SI: s_or_b64 -define void @kill_madmk_verifier_error() nounwind { -bb: - br label %bb2 - -bb1: ; preds = %bb2 - ret void - -bb2: ; preds = %bb6, %bb - %tmp = phi float [ undef, %bb ], [ %tmp8, %bb6 ] - %tmp3 = fsub float undef, %tmp - %tmp5 = fcmp oeq float %tmp3, 1.000000e+04 - br i1 %tmp5, label %bb1, label %bb6 - -bb6: ; preds = %bb2 - %tmp4 = fmul float %tmp, undef - %tmp7 = fmul float %tmp4, 0x40E55DD180000000 - %tmp8 = fadd float %tmp7, undef - br label %bb2 -} diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll deleted file mode 100644 index c357524b140..00000000000 --- a/test/CodeGen/R600/max-literals.ll +++ /dev/null @@ -1,67 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: ADD * - -define void @main(<4 x float> 
inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = fadd float %0, 2.0 - %6 = fadd float %1, 3.0 - %7 = fadd float %2, 4.0 - %8 = fadd float %3, 5.0 - %9 = bitcast float %4 to i32 - %10 = mul i32 %9, 6 - %11 = bitcast i32 %10 to float - %12 = insertelement <4 x float> undef, float %5, i32 0 - %13 = insertelement <4 x float> %12, float %6, i32 1 - %14 = insertelement <4 x float> %13, float %7, i32 2 - %15 = insertelement <4 x float> %14, float %8, i32 3 - %16 = insertelement <4 x float> %15, float %11, i32 3 - - %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) - %18 = insertelement <4 x float> undef, float %17, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) - ret void -} - -; CHECK-LABEL: {{^}}main2: -; CHECK-NOT: ADD * - -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = fadd float %0, 2.0 - %6 = fadd float %1, 3.0 - %7 = fadd float %2, 4.0 - %8 = fadd float %3, 2.0 - %9 = bitcast float %4 to i32 - %10 = mul i32 %9, 6 - %11 = bitcast i32 %10 to float - %12 = insertelement <4 x float> undef, float %5, i32 0 - %13 = insertelement <4 x float> %12, float %6, i32 1 - %14 = insertelement <4 x float> %13, float %7, i32 2 - %15 = insertelement <4 x float> %14, float %8, i32 3 - %16 = insertelement <4 x float> %15, float %11, i32 3 - - %17 = call float @llvm.AMDGPU.dp4(<4 x float> %15,<4 x float> %16) - %18 = insertelement <4 x float> undef, float %17, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll deleted file mode 100644 index fef3e2f0a21..00000000000 --- a/test/CodeGen/R600/max.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imax_sge_i32 -; SI: v_max_i32_e32 -define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imax_sge_i32 -; SI: s_max_i32 -define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 
4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imax_sge_imm_i32: -; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 -define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sge i32 %a, 9 - %val = select i1 %cmp, i32 %a, i32 9 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imax_sgt_imm_i32: -; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 -define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sgt i32 %a, 9 - %val = select i1 %cmp, i32 %a, i32 9 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_imax_sgt_i32 -; SI: v_max_i32_e32 -define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sgt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imax_sgt_i32 -; SI: s_max_i32 -define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sgt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umax_uge_i32 -; SI: v_max_u32_e32 -define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp uge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umax_uge_i32 -; SI: s_max_u32 -define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp uge i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umax_ugt_i32 -; SI: v_max_u32_e32 -define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ugt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umax_ugt_i32 -; SI: s_max_u32 -define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp ugt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure redundant and removed -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: -; SI-DAG: 
s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { - %a.ext = zext i16 %a to i32 - %b.ext = zext i16 %b to i32 - %cmp = icmp ugt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %mask = and i32 %val, 65535 - store i32 %mask, i32 addrspace(1)* %out - ret void -} - -; Make sure redundant sign_extend_inreg removed. - -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { - %a.ext = sext i16 %a to i32 - %b.ext = sext i16 %b to i32 - %cmp = icmp sgt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %shl = shl i32 %val, 16 - %sextinreg = ashr i32 %shl, 16 - store i32 %sextinreg, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should get match min/max through extends inserted by -; legalization. - -; FUNC-LABEL: {{^}}s_test_imin_sge_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_ge_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { - %cmp = icmp sge i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll deleted file mode 100644 index cfb94b272e5..00000000000 --- a/test/CodeGen/R600/max3.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imax3_sgt_i32 -; SI: v_max3_i32 -define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp sgt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp sgt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umax3_ugt_i32 -; SI: v_max3_u32 -define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load 
i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp ugt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp ugt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/merge-stores.ll b/test/CodeGen/R600/merge-stores.ll deleted file mode 100644 index dbf9d4481ff..00000000000 --- a/test/CodeGen/R600/merge-stores.ll +++ /dev/null @@ -1,536 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s - -; Run with devices with different unaligned load restrictions. - -; TODO: Vector element tests -; TODO: Non-zero base offset for load and store combinations -; TODO: Same base addrspacecasted - - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - - store i8 123, i8 addrspace(1)* %out.gep.1 - store i8 456, i8 addrspace(1)* %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align: -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 - - store i8 123, i8 addrspace(1)* %out.gep.1 - store i8 456, i8 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: -; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 123, i16 addrspace(1)* %out.gep.1 - store i16 456, i16 addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: -; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 0, i16 addrspace(1)* %out.gep.1 - store i16 0, i16 addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align: -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: s_endpgm -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 - - store i16 123, i16 addrspace(1)* %out.gep.1 - store i16 456, i16 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32: -; SI-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; SI-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] -; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = 
getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* - store float 1.0, float addrspace(1)* %out.gep.1.bc - store i32 456, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32: -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* - store i32 123, i32 addrspace(1)* %out.gep.1.bc - store float 4.0, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_i32: -; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 333, i32 addrspace(1)* %out.gep.3 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dwordx2 v -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - - store float 8.0, float addrspace(1)* %out - store float 1.0, float addrspace(1)* %out.gep.1 - store float 2.0, float addrspace(1)* %out.gep.2 - store float 4.0, float addrspace(1)* %out.gep.3 - ret void -} - -; First store is out of order. Because of order of combines, the -; consecutive store fails because only some of the stores have been -; replaced with integer constant stores, and then won't merge because -; the types are different. 
- -; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - - store float 1.0, float addrspace(1)* %out.gep.1 - store float 2.0, float addrspace(1)* %out.gep.2 - store float 4.0, float addrspace(1)* %out.gep.3 - store float 8.0, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_3_constants_i32: -; SI-DAG: buffer_store_dwordx2 -; SI-DAG: buffer_store_dword -; SI-NOT: buffer_store_dword -; GCN: s_endpgm -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - - store i32 123, i32 addrspace(1)* %out.gep.1 - store i32 456, i32 addrspace(1)* %out.gep.2 - store i32 1234, i32 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: -; XGCN: buffer_store_dwordx4 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: -; XGCN: buffer_store_dwordx4 -; XGCN: buffer_store_dwordx4 - -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -; GCN: buffer_store_dwordx2 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { - %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 - %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 - %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 - - store i64 123, i64 addrspace(1)* %out.gep.1 - store i64 456, i64 addrspace(1)* %out.gep.2 - store i64 333, i64 addrspace(1)* %out.gep.3 - store i64 1234, i64 addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx2 [[LOAD]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN: buffer_store_dwordx2 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %lo = load i32, i32 addrspace(1)* %in.gep.0 - %hi = 
load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %lo, i32 addrspace(1)* %out.gep.0 - store i32 %hi, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - - %lo = load i32, i32 addrspace(1)* %in - %hi = load i32, i32 addrspace(1)* %in.gep.1 - - store i32 %hi, i32 addrspace(1)* %out - store i32 %lo, i32 addrspace(1)* %out.gep.1 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v -; GCN: s_waitcnt -; SI-DAG: buffer_store_dword v -; SI-DAG: buffer_store_dwordx2 v -; GCN: s_endpgm -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3 - - %x = load float, float addrspace(1)* %in - %y = load float, float addrspace(1)* %in.gep.1 - %z = load float, float addrspace(1)* %in.gep.2 - %w = load float, float addrspace(1)* %in.gep.3 - - store float %x, float addrspace(1)* %out - store float %y, float 
addrspace(1)* %out.gep.1 - store float %z, float addrspace(1)* %out.gep.2 - store float %w, float addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: buffer_store_dwordx4 [[LOAD]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14 - %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7 - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10 - - %x = load i32, i32 addrspace(1)* %in.gep.0 - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - store i32 %x, i32 addrspace(1)* %out.gep.0 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32: -; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: s_barrier -; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.AMDGPU.barrier.local() #1 - - store i32 %w, i32 addrspace(1)* %out.gep.3 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %x, i32 addrspace(1)* %out - - ret void -} - -; TODO: Re-packing of loaded register required. Maybe an IR pass -; should catch this? 
- -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32: -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: buffer_load_dword v -; GCN: s_barrier -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2 - %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3 - - %x = load i32, i32 addrspace(1)* %in - %y = load i32, i32 addrspace(1)* %in.gep.1 - %z = load i32, i32 addrspace(1)* %in.gep.2 - %w = load i32, i32 addrspace(1)* %in.gep.3 - - ; Make sure the barrier doesn't stop this - tail call void @llvm.AMDGPU.barrier.local() #1 - - store i32 %w, i32 addrspace(1)* %out - store i32 %z, i32 addrspace(1)* %out.gep.1 - store i32 %y, i32 addrspace(1)* %out.gep.2 - store i32 %x, i32 addrspace(1)* %out.gep.3 - - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8: -; GCN: buffer_load_dword [[LOAD:v[0-9]+]] -; GCN: buffer_store_dword [[LOAD]] -; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in, align 4 - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out, align 4 - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 - %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 - %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 - %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2 - %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3 - - %x = load i8, i8 addrspace(1)* %in - %y = load i8, i8 addrspace(1)* %in.gep.1 - %z = load i8, i8 addrspace(1)* %in.gep.2 - %w = load i8, i8 addrspace(1)* %in.gep.3 - - store i8 %x, i8 addrspace(1)* %out - store i8 %y, i8 addrspace(1)* %out.gep.1 - store i8 %z, i8 addrspace(1)* %out.gep.2 - store i8 %w, i8 addrspace(1)* %out.gep.3 - ret void -} - -; This works once AA is enabled on the subtarget -; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32: -; GCN: buffer_load_dwordx4 
[[LOAD:v\[[0-9]+:[0-9]+\]]] -; XGCN: buffer_store_dwordx4 [[LOAD]] -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -; GCN: buffer_store_dword v -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 - %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in - - %x = extractelement <4 x i32> %vec, i32 0 - %y = extractelement <4 x i32> %vec, i32 1 - %z = extractelement <4 x i32> %vec, i32 2 - %w = extractelement <4 x i32> %vec, i32 3 - - store i32 %x, i32 addrspace(1)* %out - store i32 %y, i32 addrspace(1)* %out.gep.1 - store i32 %z, i32 addrspace(1)* %out.gep.2 - store i32 %w, i32 addrspace(1)* %out.gep.3 - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_2_constants_i8: -; GCN: ds_write_b8 -; GCN: ds_write_b8 -; GCN: s_endpgm -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 - - store i8 123, i8 addrspace(3)* %out.gep.1 - store i8 456, i8 addrspace(3)* %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_2_constants_i32: -; GCN-DAG: s_movk_i32 [[SLO:s[0-9]+]], 0x1c8 -; GCN-DAG: s_movk_i32 [[SHI:s[0-9]+]], 0x7b -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[SLO]] -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHI]] -; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out - ret void -} - -; GCN-LABEL: {{^}}merge_local_store_4_constants_i32: -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -; GCN: ds_write_b32 -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { - %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 - %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 - %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 - - store i32 123, i32 addrspace(3)* %out.gep.1 - store i32 456, i32 addrspace(3)* %out.gep.2 - store i32 333, i32 addrspace(3)* %out.gep.3 - store i32 1234, i32 addrspace(3)* %out - ret void -} - -declare void @llvm.AMDGPU.barrier.local() #1 - -attributes #0 = { nounwind } -attributes #1 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll deleted file mode 100644 index 0332d1a8e40..00000000000 --- a/test/CodeGen/R600/min.ll +++ /dev/null @@ -1,189 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imin_sle_i32 -; SI: v_min_i32_e32 -define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp sle i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - 
-; FUNC-LABEL: @s_test_imin_sle_i32 -; SI: s_min_i32 -define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp sle i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_imin_slt_i32 -; SI: v_min_i32_e32 -define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp slt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_imin_slt_i32 -; SI: s_min_i32 -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp slt i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp slt i32 %a, 8 - %val = select i1 %cmp, i32 %a, i32 8 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32: -; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { - %cmp = icmp sle i32 %a, 8 - %val = select i1 %cmp, i32 %a, i32 8 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ule_i32 -; SI: v_min_u32_e32 -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ule i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umin_ule_i32 -; SI: s_min_u32 -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp ule i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ult_i32 -; SI: v_min_u32_e32 -define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @s_test_umin_ult_i32 -; SI: s_min_u32 -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) 
nounwind { - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_ult_i32_multi_use -; SI-NOT: v_min -; SI: v_cmp_lt_u32 -; SI-NEXT: v_cndmask_b32 -; SI-NOT: v_min -; SI: s_endpgm -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %outgep0 = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid - %outgep1 = getelementptr i1, i1 addrspace(1)* %out1, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %cmp = icmp ult i32 %a, %b - %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %outgep0, align 4 - store i1 %cmp, i1 addrspace(1)* %outgep1 - ret void -} - -; Make sure redundant and removed -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { - %a.ext = zext i16 %a to i32 - %b.ext = zext i16 %b to i32 - %cmp = icmp ult i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %mask = and i32 %val, 65535 - store i32 %mask, i32 addrspace(1)* %out - ret void -} - -; Make sure redundant sign_extend_inreg removed. - -; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16: -; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]] -; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] -; SI-NEXT: buffer_store_dword [[VMIN]] -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { - %a.ext = sext i16 %a to i32 - %b.ext = sext i16 %b to i32 - %cmp = icmp slt i32 %a.ext, %b.ext - %val = select i1 %cmp, i32 %a.ext, i32 %b.ext - %shl = shl i32 %val, 16 - %sextinreg = ashr i32 %shl, 16 - store i32 %sextinreg, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should get match min/max through extends inserted by -; legalization. 
- -; FUNC-LABEL: {{^}}s_test_imin_sle_i16: -; SI: s_sext_i32_i16 -; SI: s_sext_i32_i16 -; SI: v_cmp_le_i32_e32 -; SI: v_cndmask_b32 -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { - %cmp = icmp sle i16 %a, %b - %val = select i1 %cmp, i16 %a, i16 %b - store i16 %val, i16 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll deleted file mode 100644 index 38ef46d1bdd..00000000000 --- a/test/CodeGen/R600/min3.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: @v_test_imin3_slt_i32 -; SI: v_min3_i32 -define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp slt i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin3_ult_i32 -; SI: v_min3_u32 -define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %icmp0 = icmp ult i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - %icmp1 = icmp ult i32 %i0, %c - %i1 = select i1 %icmp1, i32 %i0, i32 %c - store i32 %i1, i32 addrspace(1)* %outgep, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin_umin_umin -; SI: v_min_i32 -; SI: v_min3_i32 -define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tid2 = mul i32 %tid, 2 - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - - %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 - %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 - %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 - - %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 - - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %d = load i32, i32 addrspace(1)* %gep3, align 4 - - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - - %icmp1 = icmp 
slt i32 %c, %d - %i1 = select i1 %icmp1, i32 %c, i32 %d - - %icmp2 = icmp slt i32 %i0, %i1 - %i2 = select i1 %icmp2, i32 %i0, i32 %i1 - - store i32 %i2, i32 addrspace(1)* %outgep1, align 4 - ret void -} - -; FUNC-LABEL: @v_test_umin3_2_uses -; SI-NOT: v_min3 -define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %tid2 = mul i32 %tid, 2 - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid - %gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid - - %gep3 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid2 - %gep4 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid2 - %gep5 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid2 - - %outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2 - - %a = load i32, i32 addrspace(1)* %gep0, align 4 - %b = load i32, i32 addrspace(1)* %gep1, align 4 - %c = load i32, i32 addrspace(1)* %gep2, align 4 - %d = load i32, i32 addrspace(1)* %gep3, align 4 - - %icmp0 = icmp slt i32 %a, %b - %i0 = select i1 %icmp0, i32 %a, i32 %b - - %icmp1 = icmp slt i32 %c, %d - %i1 = select i1 %icmp1, i32 %c, i32 %d - - %icmp2 = icmp slt i32 %i0, %c - %i2 = select i1 %icmp2, i32 %i0, i32 %c - - store i32 %i2, i32 addrspace(1)* %outgep0, align 4 - store i32 %i0, i32 addrspace(1)* %outgep1, align 4 - ret void -} diff --git a/test/CodeGen/R600/missing-store.ll b/test/CodeGen/R600/missing-store.ll deleted file mode 100644 index 4af9cdf1b96..00000000000 --- a/test/CodeGen/R600/missing-store.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - -@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8 - -; Make sure when the load from %ptr2 is folded the chain isn't lost, -; resulting in losing the store to gptr - -; FUNC-LABEL: {{^}}missing_store_reduced: -; SI: ds_read_b64 -; SI: buffer_store_dword -; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: s_endpgm -define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } - diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll deleted file mode 100644 index b19163f294e..00000000000 --- a/test/CodeGen/R600/mubuf.ll +++ /dev/null @@ -1,183 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -declare i32 @llvm.r600.read.tidig.x() readnone - -;;;==========================================================================;;; -;;; MUBUF LOAD TESTS -;;;==========================================================================;;; - -; MUBUF load with an immediate byte offset that fits into 12-bits -; CHECK-LABEL: {{^}}mubuf_load0: -; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 -define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 - %1 = load i32, 
i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; MUBUF load with the largest possible immediate offset -; CHECK-LABEL: {{^}}mubuf_load1: -; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 -define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 - %1 = load i8, i8 addrspace(1)* %0 - store i8 %1, i8 addrspace(1)* %out - ret void -} - -; MUBUF load with an immediate byte offset that doesn't fit into 12-bits -; CHECK-LABEL: {{^}}mubuf_load2: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 -; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 -define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; MUBUF load with a 12-bit immediate offset and a register offset -; CHECK-LABEL: {{^}}mubuf_load3: -; CHECK-NOT: ADD -; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 -define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset - %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 - %2 = load i32, i32 addrspace(1)* %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}soffset_max_imm: -; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { -main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 - %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - %tmp4 = add i32 %6, 16 - %tmp5 = bitcast float 0.0 to i32 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - ret void -} - -; Make sure immediates that aren't inline constants don't get folded into -; the soffset operand. -; FIXME: for this test we should be smart enough to shift the immediate into -; the offset field. 
-; CHECK-LABEL: {{^}}soffset_no_fold: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 -; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { -main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 - %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - %tmp4 = add i32 %6, 16 - %tmp5 = bitcast float 0.0 to i32 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp5, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - ret void -} - -;;;==========================================================================;;; -;;; MUBUF STORE TESTS -;;;==========================================================================;;; - -; MUBUF store with an immediate byte offset that fits into 12-bits -; CHECK-LABEL: {{^}}mubuf_store0: -; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 -define void @mubuf_store0(i32 addrspace(1)* %out) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 - store i32 0, i32 addrspace(1)* %0 - ret void -} - -; MUBUF store with the largest possible immediate offset -; CHECK-LABEL: {{^}}mubuf_store1: -; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 - -define void @mubuf_store1(i8 addrspace(1)* %out) { -entry: - %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 - store i8 0, i8 addrspace(1)* %0 - ret void -} - -; MUBUF store with an immediate byte offset that doesn't fit into 12-bits -; CHECK-LABEL: {{^}}mubuf_store2: -; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 -; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 -define void @mubuf_store2(i32 addrspace(1)* %out) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 - store i32 0, i32 addrspace(1)* %0 - ret void -} - -; MUBUF store with a 12-bit immediate offset and a register offset -; CHECK-LABEL: {{^}}mubuf_store3: -; CHECK-NOT: ADD -; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 -define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { -entry: - %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset - %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 - store i32 0, i32 addrspace(1)* %1 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr: -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 -define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { - store i32 99, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: -; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 -; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 
[[SOFFSET]] -define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: -; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 -; CHECK: buffer_atomic_add v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { - %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 - %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst - ret void -} - -; CHECK-LABEL: {{^}}store_vgpr_ptr: -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - store i32 99, i32 addrspace(1)* %out.gep, align 4 - ret void -} - -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } -attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll deleted file mode 100644 index 94e0f96b323..00000000000 --- a/test/CodeGen/R600/mul.ll +++ /dev/null @@ -1,200 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; mul24 and mad24 are affected - -; FUNC-LABEL: {{^}}test_mul_v2i32: -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = mul <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_mul_v4i32: -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: MULLO_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = mul <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: s_mul_i32 -; SI: buffer_store_dword -define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %mul = mul i64 %b, %a - %trunc = trunc i64 %mul to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32: -; SI: s_load_dword -; SI: s_load_dword -; SI: v_mul_lo_i32 -; SI: buffer_store_dword -define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %mul = mul i64 %b, %a - %trunc = trunc i64 %mul to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top -; 32-bits of both arguments are sign bits. -; FUNC-LABEL: {{^}}mul64_sext_c: -; EG-DAG: MULLO_INT -; EG-DAG: MULHI_INT -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_i32 -define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { -entry: - %0 = sext i32 %in to i64 - %1 = mul i64 %0, 80 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_mul64_sext_c: -; EG-DAG: MULLO_INT -; EG-DAG: MULHI_INT -; SI-DAG: v_mul_lo_i32 -; SI-DAG: v_mul_hi_i32 -; SI: s_endpgm -define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ext = sext i32 %val to i64 - %mul = mul i64 %ext, 80 - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm: -; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} -; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}} -; SI: s_endpgm -define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { - %val = load i32, i32 addrspace(1)* %in, align 4 - %ext = sext i32 %val to i64 - %mul = mul i64 %ext, 9 - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_mul_i32: -; SI: s_load_dword [[SRC0:s[0-9]+]], -; SI: s_load_dword [[SRC1:s[0-9]+]], -; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -; SI: s_endpgm -define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %mul = mul i32 %a, %b - store i32 %mul, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}v_mul_i32: -; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = mul i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; A standard 64-bit multiply. 
The expansion should be around 6 instructions. -; It would be difficult to match the expansion correctly without writing -; a really complicated list of FileCheck expressions. I don't want -; to confuse people who may 'break' this test with a correct optimization, -; so this test just uses FUNC-LABEL to make sure the compiler does not -; crash with a 'failed to select' error. - -; FUNC-LABEL: {{^}}s_mul_i64: -define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %mul = mul i64 %a, %b - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_mul_i64: -; SI: v_mul_lo_i32 -define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %mul = mul i64 %a, %b - store i64 %mul, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}mul32_in_branch: -; SI: s_mul_i32 -define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { -entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i32, i32 addrspace(1)* %in - br label %endif - -else: - %2 = mul i32 %a, %b - br label %endif - -endif: - %3 = phi i32 [%1, %if], [%2, %else] - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}mul64_in_branch: -; SI-DAG: s_mul_i32 -; SI-DAG: v_mul_hi_u32 -; SI: s_endpgm -define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i64, i64 addrspace(1)* %in - br label %endif - -else: - %2 = mul i64 %a, %b - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll deleted file mode 100644 index 7609dcc87af..00000000000 --- a/test/CodeGen/R600/mul_int24.ll +++ /dev/null @@ -1,23 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}i32_mul24: -; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
-; EG: MULLO_INT -; Make sure we are not masking the inputs -; CM-NOT: AND -; CM: MUL_INT24 -; SI-NOT: and -; SI: v_mul_i32_i24 -define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = shl i32 %a, 8 - %a_24 = ashr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = ashr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - store i32 %2, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll deleted file mode 100644 index e640a7cd69f..00000000000 --- a/test/CodeGen/R600/mul_uint24.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC - -; FUNC-LABEL: {{^}}u32_mul24: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI: v_mul_u32_u24 - -define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = shl i32 %a, 8 - %a_24 = lshr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = lshr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i16_mul24: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; EG: 16 -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 -define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) { -entry: - %0 = mul i16 %a, %b - %1 = sext i16 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i8_mul24: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - -define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) { -entry: - %0 = mul i8 %a, %b - %1 = sext i8 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; Multiply with 24-bit inputs and 64-bit output -; FUNC-LABEL: {{^}}mul24_i64: -; EG: MUL_UINT24 -; EG: MULHI -; SI: v_mul_u32_u24 -; FIXME: SI should use 24-bit mulhi -; SI: v_mul_hi_u32 -define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = shl i64 %a, 40 - %a_24 = lshr i64 %0, 40 - %1 = shl i64 %b, 40 - %b_24 = lshr i64 %1, 40 - %2 = mul i64 %a_24, %b_24 - store i64 %2, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll deleted file mode 100644 index 29b0944a553..00000000000 --- a/test/CodeGen/R600/mulhu.ll +++ /dev/null @@ -1,17 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab -;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} -;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0 - -define void @test(i32 %p) { - %i = udiv i32 %p, 3 - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare <4 x float> @llvm.SI.sample.(i32,
<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll deleted file mode 100644 index 9a814b579de..00000000000 --- a/test/CodeGen/R600/no-initializer-constant-addrspace.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -o /dev/null %s -; RUN: llc -march=amdgcn -mcpu=tonga -o /dev/null %s -; RUN: llc -march=r600 -mcpu=cypress -o /dev/null %s - -@extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 - -; FUNC-LABEL: {{^}}load_extern_const_init: -define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} - -@undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 - -; FUNC-LABEL: {{^}}load_undef_const_init: -define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { - %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 - store i32 %val, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/no-shrink-extloads.ll b/test/CodeGen/R600/no-shrink-extloads.ll deleted file mode 100644 index e4328ecbaca..00000000000 --- a/test/CodeGen/R600/no-shrink-extloads.ll +++ /dev/null @@ -1,191 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; Make sure we don't turn the 32-bit argument load into a 16-bit -; load. There aren't extending scalar loads, so that would require -; using a buffer_load instruction. - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: -; SI: s_load_dword s -; SI: buffer_store_short v -define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i16 - store i16 %trunc, i16 addrspace(1)* %out - ret void -} - -; It should be OK (and probably performance neutral) to reduce this, -; but we don't know if the load is uniform yet.
- -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: -; SI: buffer_load_dword v -; SI: buffer_store_short v -define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i16 - store i16 %trunc, i16 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { - %trunc = trunc i32 %arg to i1 - store i1 %trunc, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid - %load = load i32, i32 addrspace(1)* %gep.in - %trunc = trunc i32 %load to i1 - store i1 %trunc, i1 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: -; SI: s_load_dword s -; SI: buffer_store_dword v -define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { - %trunc = trunc i64 %arg to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: -; SI: buffer_load_dword v -; SI: buffer_store_dword v -define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %trunc = trunc i64 %load to i32 - store i32 %trunc, i32 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: -; SI: s_load_dword s -; SI: buffer_store_dword v -define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { - %srl = lshr i64 %arg, 32 - %trunc = trunc i64 %srl to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: -; SI: buffer_load_dword v -; SI: buffer_store_dword v -define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = 
getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %srl = lshr i64 %load, 32 - %trunc = trunc i64 %srl to i32 - store i32 %trunc, i32 addrspace(1)* %gep.out - ret void -} - -; Might as well reduce to 8-bit loads. -; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { - %trunc = trunc i16 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: -; SI: buffer_load_ubyte v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i16, i16 addrspace(1)* %gep.in - %trunc = trunc i16 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { - %srl = lshr i64 %arg, 32 - %trunc = trunc i64 %srl to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %srl = lshr i64 %load, 32 - %trunc = trunc i64 %srl to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: -; SI: s_load_dword s -; SI: buffer_store_byte v -define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { - %trunc = trunc i64 %arg to i8 - store i8 %trunc, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: -; SI: buffer_load_dword v -; SI: buffer_store_byte v -define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid - %load = load i64, i64 addrspace(1)* %gep.in - %trunc = trunc i64 %load to i8 - store i8 %trunc, i8 addrspace(1)* %gep.out - ret void -} diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll deleted file mode 100644 index 816755efb07..00000000000 --- a/test/CodeGen/R600/operand-folding.ll +++ /dev/null @@ -1,113 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}fold_sgpr: -; CHECK: v_add_i32_e32 v{{[0-9]+}}, s -define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { -entry: - %tmp0 = icmp ne i32 %fold, 0 - br i1 %tmp0, label %if, label %endif - -if: - %id = call i32 @llvm.r600.read.tidig.x() - %offset = add i32 %fold, %id - %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset - store i32 0, i32 addrspace(1)* %tmp1 - br label %endif - -endif: - ret void -} - -; CHECK-LABEL: {{^}}fold_imm: -; 
CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 -define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { -entry: - %fold = add i32 3, 2 - %tmp0 = icmp ne i32 %cmp, 0 - br i1 %tmp0, label %if, label %endif - -if: - %id = call i32 @llvm.r600.read.tidig.x() - %val = or i32 %id, %fold - store i32 %val, i32 addrspace(1)* %out - br label %endif - -endif: - ret void -} - -; CHECK-LABEL: {{^}}fold_64bit_constant_add: -; CHECK-NOT: s_mov_b64 -; FIXME: It would be better if we could use v_add here and drop the extra -; v_mov_b32 instructions. -; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1 -; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0 -; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]] -; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] -; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, - -define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { -entry: - %tmp0 = add i64 %val, 1 - store i64 %tmp0, i64 addrspace(1)* %out - ret void -} - -; Inline constants should always be folded. - -; CHECK-LABEL: {{^}}vector_inline: -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} - -define void @vector_inline(<4 x i32> addrspace(1)* %out) { -entry: - %tmp0 = call i32 @llvm.r600.read.tidig.x() - %tmp1 = add i32 %tmp0, 1 - %tmp2 = add i32 %tmp0, 2 - %tmp3 = add i32 %tmp0, 3 - %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 - %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 - %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 - %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 - %tmp4 = xor <4 x i32> , %vec3 - store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out - ret void -} - -; Immediates with one use should be folded -; CHECK-LABEL: {{^}}imm_one_use: -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} - -define void @imm_one_use(i32 addrspace(1)* %out) { -entry: - %tmp0 = call i32 @llvm.r600.read.tidig.x() - %tmp1 = xor i32 %tmp0, 100 - store i32 %tmp1, i32 addrspace(1)* %out - ret void -} -; CHECK-LABEL: {{^}}vector_imm: -; CHECK: s_movk_i32 [[IMM:s[0-9]+]], 0x64 -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} - -define void @vector_imm(<4 x i32> addrspace(1)* %out) { -entry: - %tmp0 = call i32 @llvm.r600.read.tidig.x() - %tmp1 = add i32 %tmp0, 1 - %tmp2 = add i32 %tmp0, 2 - %tmp3 = add i32 %tmp0, 3 - %vec0 = insertelement <4 x i32> undef, i32 %tmp0, i32 0 - %vec1 = insertelement <4 x i32> %vec0, i32 %tmp1, i32 1 - %vec2 = insertelement <4 x i32> %vec1, i32 %tmp2, i32 2 - %vec3 = insertelement <4 x i32> %vec2, i32 %tmp3, i32 3 - %tmp4 = xor <4 x i32> , %vec3 - store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/operand-spacing.ll b/test/CodeGen/R600/operand-spacing.ll deleted file mode 100644 index 20420a84de6..00000000000 --- a/test/CodeGen/R600/operand-spacing.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s - -; Make sure there isn't an extra space 
between the instruction name and first operands. - -; GCN-LABEL: {{^}}add_f32: -; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] -; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] -; GCN: buffer_store_dword [[RESULT]], -define void @add_f32(float addrspace(1)* %out, float %a, float %b) { - %result = fadd float %a, %b - store float %result, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll deleted file mode 100644 index 1c04090b407..00000000000 --- a/test/CodeGen/R600/or.ll +++ /dev/null @@ -1,178 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}or_v2i32: -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = or <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}or_v4i32: -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: OR_INT {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = or <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_or_i32: -; SI: s_or_b32 -define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %or = or i32 %a, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_or_i32: -; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { - %loada = load i32, i32 addrspace(1)* %a - %or = or i32 %loada, %b - store i32 %or, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_or_literal_i32: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f -define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { - %or = or i32 %a, 99999 - store i32 %or, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}vector_or_literal_i32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { - %loada = load i32, i32 addrspace(1)* %a, align 4 - %or = or i32 %loada, 65535 - store i32 %or, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: -; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} -define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { - %loada = load i32, i32 addrspace(1)* %a, align 4 - %or = or i32 %loada, 4 - store i32 %or, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}scalar_or_i64: -; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z - -; SI: s_or_b64 -define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %or = or i64 %a, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_or_i64: -; SI: v_or_b32_e32 v{{[0-9]}} -; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 8 - %loadb = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, %loadb - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_vector_or_i64: -; SI: v_or_b32_e32 v{{[0-9]}} -; SI: v_or_b32_e32 v{{[0-9]}} -define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { - %loada = load i64, i64 addrspace(1)* %a - %or = or i64 %loada, %b - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_or_i64_loadimm: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f -; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) 
{ - %loada = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, 22470723082367 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FIXME: The or 0 should really be removed. -; FUNC-LABEL: {{^}}vector_or_i64_imm: -; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]] -; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}} -; SI: s_endpgm -define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 8 - %or = or i64 %loada, 8 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}trunc_i64_or_to_i32: -; SI: s_load_dword s[[SREG0:[0-9]+]] -; SI: s_load_dword s[[SREG1:[0-9]+]] -; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]] -; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { - %add = or i64 %b, %a - %trunc = trunc i64 %add to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}or_i1: -; EG: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} - -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { - %a = load float, float addrspace(1)* %in0 - %b = load float, float addrspace(1)* %in1 - %acmp = fcmp oge float %a, 0.000000e+00 - %bcmp = fcmp oge float %b, 0.000000e+00 - %or = or i1 %acmp, %bcmp - %result = zext i1 %or to i32 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_or_i1: -; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { - %cmp0 = icmp eq i32 %a, %b - %cmp1 = icmp eq i32 %c, %d - %or = or i1 %cmp0, %cmp1 - store i1 %or, i1 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/packetizer.ll b/test/CodeGen/R600/packetizer.ll deleted file mode 100644 index 49a7c0df748..00000000000 --- a/test/CodeGen/R600/packetizer.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -; CHECK: {{^}}test: -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y -; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z -; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W - -define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { -entry: - %shl = sub i32 32, %e - %x = add i32 %x_arg, 1 - %x.0 = shl i32 %x, %shl - %x.1 = lshr i32 %x, %e - %x.2 = or i32 %x.0, %x.1 - %y = add i32 %y_arg, 1 - %y.0 = shl i32 %y, %shl - %y.1 = lshr i32 %y, %e - %y.2 = or i32 %y.0, %y.1 - %z = add i32 %z_arg, 1 - %z.0 = shl i32 %z, %shl - %z.1 = lshr i32 %z, %e - %z.2 = or i32 %z.0, %z.1 - %w = add i32 %w_arg, 1 - %w.0 = shl i32 %w, %shl - %w.1 = lshr i32 %w, %e - %w.2 = or i32 %w.0, %w.1 - %xy = or i32 %x.2, %y.2 - %zw = or i32 %z.2, %w.2 - %xyzw = or i32 %xy, %zw - store i32 %xyzw, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll deleted file mode 100644 index f32b044198a..00000000000 --- a/test/CodeGen/R600/parallelandifcollapse.ll +++ /dev/null @@ -1,59 +0,0 @@ -; Function Attrs: nounwind -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s -; -; CFG flattening should use parallel-and mode to generate 
branch conditions and -; then merge if-regions with the same bodies. -; -; CHECK: AND_INT -; CHECK-NEXT: AND_INT -; CHECK-NEXT: OR_INT - -; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transformation, however now that we are using local memory for -; allocas, the transformation isn't happening. - -define void @_Z9chk1D_512v() #0 { -entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 - %cmp = icmp ne i32 %0, %1 - br i1 %cmp, label %land.lhs.true, label %if.end - -land.lhs.true:                                    ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 - %cmp1 = icmp ne i32 %2, %3 - br i1 %cmp1, label %if.then, label %if.end - -if.then:                                          ; preds = %land.lhs.true - store i32 1, i32* %data, align 4 - br label %if.end - -if.end:                                           ; preds = %if.then, %land.lhs.true, %entry - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 - %cmp2 = icmp ne i32 %4, %5 - br i1 %cmp2, label %land.lhs.true3, label %if.end6 - -land.lhs.true3:                                   ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 - %cmp4 = icmp ne i32 %6, %7 - br i1 %cmp4, label %if.then5, label %if.end6 - -if.then5:                                         ; preds = %land.lhs.true3 - store i32 1, i32* %data, align 4 - br label %if.end6 - -if.end6:                                          ; preds = %if.then5, %land.lhs.true3, %if.end - ret void -} diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll deleted file mode 100644 index 1da1e91b8ab..00000000000 --- a/test/CodeGen/R600/parallelorifcollapse.ll +++ /dev/null @@ -1,66 +0,0 @@ -; Function Attrs: nounwind -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; -; CFG flattening should use parallel-or to generate branch conditions and -; then merge if-regions with the same bodies. - -; FIXME: For some reason having the allocas here allowed the flatten cfg pass -; to do its transformation, however now that we are using local memory for -; allocas, the transformation isn't happening.
-; XFAIL: * -; -; CHECK: OR_INT -; CHECK-NEXT: OR_INT -; CHECK-NEXT: OR_INT -define void @_Z9chk1D_512v() #0 { -entry: - %a0 = alloca i32, align 4 - %b0 = alloca i32, align 4 - %c0 = alloca i32, align 4 - %d0 = alloca i32, align 4 - %a1 = alloca i32, align 4 - %b1 = alloca i32, align 4 - %c1 = alloca i32, align 4 - %d1 = alloca i32, align 4 - %data = alloca i32, align 4 - %0 = load i32, i32* %a0, align 4 - %1 = load i32, i32* %b0, align 4 - %cmp = icmp ne i32 %0, %1 - br i1 %cmp, label %land.lhs.true, label %if.else - -land.lhs.true: ; preds = %entry - %2 = load i32, i32* %c0, align 4 - %3 = load i32, i32* %d0, align 4 - %cmp1 = icmp ne i32 %2, %3 - br i1 %cmp1, label %if.then, label %if.else - -if.then: ; preds = %land.lhs.true - br label %if.end - -if.else: ; preds = %land.lhs.true, %entry - store i32 1, i32* %data, align 4 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %4 = load i32, i32* %a1, align 4 - %5 = load i32, i32* %b1, align 4 - %cmp2 = icmp ne i32 %4, %5 - br i1 %cmp2, label %land.lhs.true3, label %if.else6 - -land.lhs.true3: ; preds = %if.end - %6 = load i32, i32* %c1, align 4 - %7 = load i32, i32* %d1, align 4 - %cmp4 = icmp ne i32 %6, %7 - br i1 %cmp4, label %if.then5, label %if.else6 - -if.then5: ; preds = %land.lhs.true3 - br label %if.end7 - -if.else6: ; preds = %land.lhs.true3, %if.end - store i32 1, i32* %data, align 4 - br label %if.end7 - -if.end7: ; preds = %if.else6, %if.then5 - ret void -} - diff --git a/test/CodeGen/R600/predicate-dp4.ll b/test/CodeGen/R600/predicate-dp4.ll deleted file mode 100644 index 6bc18759435..00000000000 --- a/test/CodeGen/R600/predicate-dp4.ll +++ /dev/null @@ -1,27 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -; CHECK-LABEL: {{^}}main: -; CHECK: PRED_SETE_INT * Pred, -; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one -define void @main(<4 x float> inreg) #0 { -main_body: - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - br i1 %3, label %IF, label %ENDIF - -IF: ; preds = %main_body - %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0) - br label %ENDIF - -ENDIF: ; preds = %IF, %main_body - %5 = phi float [%4, %IF], [0.000000e+00, %main_body] - %6 = insertelement <4 x float> undef, float %5, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0) - ret void -} - -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #1 = { readnone } -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll deleted file mode 100644 index 0ce74d97ba8..00000000000 --- a/test/CodeGen/R600/predicates.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s - -; These tests make sure the compiler is optimizing branches using predicates -; when it is legal to do so. 
- -; CHECK: {{^}}simple_if: -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF, label %ENDIF - -IF: - %1 = shl i32 %in, 1 - br label %ENDIF - -ENDIF: - %2 = phi i32 [ %in, %entry ], [ %1, %IF ] - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}simple_if_else: -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF, label %ELSE - -IF: - %1 = shl i32 %in, 1 - br label %ENDIF - -ELSE: - %2 = lshr i32 %in, 1 - br label %ENDIF - -ENDIF: - %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ] - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}nested_if: -; CHECK: ALU_PUSH_BEFORE -; CHECK: JUMP -; CHECK: POP -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF0, label %ENDIF - -IF0: - %1 = add i32 %in, 10 - %2 = icmp sgt i32 %1, 0 - br i1 %2, label %IF1, label %ENDIF - -IF1: - %3 = shl i32 %1, 1 - br label %ENDIF - -ENDIF: - %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1] - store i32 %4, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}nested_if_else: -; CHECK: ALU_PUSH_BEFORE -; CHECK: JUMP -; CHECK: POP -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - br i1 %0, label %IF0, label %ENDIF - -IF0: - %1 = add i32 %in, 10 - %2 = icmp sgt i32 %1, 0 - br i1 %2, label %IF1, label %ELSE1 - -IF1: - %3 = shl i32 %1, 1 - br label %ENDIF - -ELSE1: - %4 = lshr i32 %in, 1 - br label %ENDIF - -ENDIF: - %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1] - store i32 %5, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/private-memory-atomics.ll b/test/CodeGen/R600/private-memory-atomics.ll deleted file mode 100644 index a008ac98a43..00000000000 --- a/test/CodeGen/R600/private-memory-atomics.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s - -; This works because promote allocas pass replaces these with LDS atomics. - -; Private atomics have no real use, but at least shouldn't crash on it. 
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = atomicrmw add i32* %tmp3, i32 7 acq_rel - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %tmp4 = cmpxchg i32* %tmp3, i32 0, i32 1 acq_rel monotonic - %val = extractvalue { i32, i1 } %tmp4, 0 - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/private-memory-broken.ll b/test/CodeGen/R600/private-memory-broken.ll deleted file mode 100644 index 6b18a19f195..00000000000 --- a/test/CodeGen/R600/private-memory-broken.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=SI %s -o /dev/null 2>&1 | FileCheck %s -; RUN: not llc -verify-machineinstrs -march=amdgcn -mcpu=tonga %s -o /dev/null 2>&1 | FileCheck %s - -; Make sure promote alloca pass doesn't crash - -; CHECK: unsupported call - -declare i32 @foo(i32*) nounwind - -define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 %in - %val = call i32 @foo(i32* %tmp3) nounwind - store i32 %val, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll deleted file mode 100644 index 1c562978050..00000000000 --- a/test/CodeGen/R600/private-memory.ll +++ /dev/null @@ -1,313 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}mova_same_clause: - -; R600: LDS_WRITE -; R600: LDS_WRITE -; R600: LDS_READ -; R600: LDS_READ - -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_write_b32 -; SI-PROMOTE: ds_read_b32 -; SI-PROMOTE: ds_read_b32 - -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: 
[0x00,0x10,0x70,0xe0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { -entry: - %stack = alloca [5 x i32], align 4 - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 - store i32 4, i32* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 - store i32 5, i32* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 - %2 = load i32, i32* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 - %3 = load i32, i32* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 - ret void -} - -; This test checks that the stack offset is calculated correctly for structs. -; All register loads/stores should be optimized away, so there shouldn't be -; any MOVA instructions. -; -; XXX: This generated code has unnecessary MOVs, we should be able to optimize -; this. - -; FUNC-LABEL: {{^}}multiple_structs: -; R600-NOT: MOVA_INT -; SI-NOT: v_movrel -; SI-NOT: v_movrel -%struct.point = type { i32, i32 } - -define void @multiple_structs(i32 addrspace(1)* %out) { -entry: - %a = alloca %struct.point - %b = alloca %struct.point - %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1 - %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1 - store i32 0, i32* %a.x.ptr - store i32 1, i32* %a.y.ptr - store i32 2, i32* %b.x.ptr - store i32 3, i32* %b.y.ptr - %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0 - %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0 - %a.indirect = load i32, i32* %a.indirect.ptr - %b.indirect = load i32, i32* %b.indirect.ptr - %0 = add i32 %a.indirect, %b.indirect - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; Test direct access of a private array inside a loop. The private array -; loads and stores should be lowered to copies, so there shouldn't be any -; MOVA instructions. 
- -; FUNC-LABEL: {{^}}direct_loop: -; R600-NOT: MOVA_INT -; SI-NOT: v_movrel - -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %prv_array_const = alloca [2 x i32] - %prv_array = alloca [2 x i32] - %a = load i32, i32 addrspace(1)* %in - %b_src_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %b = load i32, i32 addrspace(1)* %b_src_ptr - %a_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - store i32 %a, i32* %a_dst_ptr - %b_dst_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1 - store i32 %b, i32* %b_dst_ptr - br label %for.body - -for.body: - %inc = phi i32 [0, %entry], [%count, %for.body] - %x_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0 - %x = load i32, i32* %x_ptr - %y_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %y = load i32, i32* %y_ptr - %xy = add i32 %x, %y - store i32 %xy, i32* %y_ptr - %count = add i32 %inc, 1 - %done = icmp eq i32 %count, 4095 - br i1 %done, label %for.end, label %for.body - -for.end: - %value_ptr = getelementptr [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0 - %value = load i32, i32* %value_ptr - store i32 %value, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}short_array: - -; R600: MOVA_INT - -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0 -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0 -; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} -define void @short_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [2 x i16] - %1 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 1 - store i16 0, i16* %1 - store i16 1, i16* %2 - %3 = getelementptr [2 x i16], [2 x i16]* %0, i32 0, i32 %index - %4 = load i16, i16* %3 - %5 = sext i16 %4 to i32 - store i32 %5, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}char_array: - -; R600: MOVA_INT - -; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0 -; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0 -define void @char_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [2 x i8] - %1 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 1 - store i8 0, i8* %1 - store i8 1, i8* %2 - %3 = getelementptr [2 x i8], [2 x i8]* %0, i32 0, i32 %index - %4 = load i8, i8* %3 - %5 = sext i8 %4 to i32 - store i32 %5, i32 addrspace(1)* %out - ret void - -} - -; Make sure we don't overwrite workitem information with private memory - -; FUNC-LABEL: {{^}}work_item_info: -; R600-NOT: MOV T0.X -; Additional check in case the move ends up in the last slot -; R600-NOT: MOV * TO.X - -; SI-NOT: v_mov_b32_e{{(32|64)}} v0 -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = alloca [2 x i32] - %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 - %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1 - store i32 0, i32* %1 - store i32 1, i32* %2 - %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in - %4 = load i32, i32* %3 - %5 = call i32 @llvm.r600.read.tidig.x() - %6 = add i32 %4, %5 - store i32 %6, i32 addrspace(1)* 
%out - ret void -} - -; Test that two stack objects are not stored in the same register -; The second stack object should be in T3.X -; FUNC-LABEL: {{^}}no_overlap: -; R600_CHECK: MOV -; R600_CHECK: [[CHAN:[XYZW]]]+ -; R600-NOT: [[CHAN]]+ -; SI: v_mov_b32_e32 v3 -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = alloca [3 x i8], align 1 - %1 = alloca [2 x i8], align 1 - %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0 - %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1 - %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2 - %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0 - %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1 - store i8 0, i8* %2 - store i8 1, i8* %3 - store i8 2, i8* %4 - store i8 1, i8* %5 - store i8 0, i8* %6 - %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in - %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in - %9 = load i8, i8* %7 - %10 = load i8, i8* %8 - %11 = add i8 %9, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - ret void -} - -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i8]] - %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1 - store i8 0, i8* %gep0 - store i8 1, i8* %gep1 - %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index - %load = load i8, i8* %gep2 - %sext = sext i8 %load to i32 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i32]] - %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x i64]] - %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1 - store i64 0, i64* %gep0 - store i64 1, i64* %gep1 - %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index - %load = load i64, i64* %gep2 - store i64 %load, i64 addrspace(1)* %out - ret void -} - -%struct.pair32 = type { i32, i32 } - -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x [2 x %struct.pair32]] - %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) { -entry: - %alloca = alloca [2 x %struct.pair32] - %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 - %gep1 = getelementptr [2 x 
%struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0 - store i32 0, i32* %gep0 - store i32 1, i32* %gep1 - %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0 - %load = load i32, i32* %gep2 - store i32 %load, i32 addrspace(1)* %out - ret void -} - -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { -entry: - %tmp = alloca [2 x i32] - %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1 - store i32 0, i32* %tmp1 - store i32 1, i32* %tmp2 - %cmp = icmp eq i32 %in, 0 - %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2 - %load = load i32, i32* %sel - store i32 %load, i32 addrspace(1)* %out - ret void -} - -; AMDGPUPromoteAlloca does not know how to handle ptrtoint. When it -; finds one, it should stop trying to promote. - -; FUNC-LABEL: ptrtoint: -; SI-NOT: ds_write -; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen -; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %alloca = alloca [16 x i32] - %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a - store i32 5, i32* %tmp0 - %tmp1 = ptrtoint [16 x i32]* %alloca to i32 - %tmp2 = add i32 %tmp1, 5 - %tmp3 = inttoptr i32 %tmp2 to i32* - %tmp4 = getelementptr i32, i32* %tmp3, i32 %b - %tmp5 = load i32, i32* %tmp4 - store i32 %tmp5, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/pv-packing.ll b/test/CodeGen/R600/pv-packing.ll deleted file mode 100644 index abeae563ff3..00000000000 --- a/test/CodeGen/R600/pv-packing.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s - -;CHECK: DOT4 T{{[0-9]\.X}} -;CHECK: MULADD_IEEE * T{{[0-9]\.W}} - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg2, i32 0 - %4 = extractelement <4 x float> %reg2, i32 1 - %5 = extractelement <4 x float> %reg2, i32 2 - %6 = extractelement <4 x float> %reg3, i32 0 - %7 = extractelement <4 x float> %reg3, i32 1 - %8 = extractelement <4 x float> %reg3, i32 2 - %9 = load <4 x float>, <4 x float> addrspace(8)* null - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9) - %12 = fmul float %0, %3 - %13 = fadd float %12, %6 - %14 = fmul float %1, %4 - %15 = fadd float %14, %7 - %16 = fmul float %2, %5 - %17 = fadd float %16, %8 - %18 = fmul float %11, %11 - %19 = fadd float %18, %0 - %20 = insertelement <4 x float> undef, float %13, i32 0 - %21 = insertelement <4 x float> %20, float %15, i32 1 - %22 = insertelement <4 x float> %21, float %17, i32 2 - %23 = insertelement <4 x float> %22, float %19, i32 3 - %24 = call float @llvm.AMDGPU.dp4(<4 x float> %23, <4 x float> %10) - %25 = insertelement <4 x float> undef, float %24, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %25, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { 
"ShaderType"="1" } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll deleted file mode 100644 index 9a57dd19765..00000000000 --- a/test/CodeGen/R600/pv.ll +++ /dev/null @@ -1,241 +0,0 @@ -; RUN: llc < %s -march=r600 | FileCheck %s - -; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) -; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = extractelement <4 x float> %reg2, i32 0 - %5 = extractelement <4 x float> %reg2, i32 1 - %6 = extractelement <4 x float> %reg2, i32 2 - %7 = extractelement <4 x float> %reg2, i32 3 - %8 = extractelement <4 x float> %reg3, i32 0 - %9 = extractelement <4 x float> %reg3, i32 1 - %10 = extractelement <4 x float> %reg3, i32 2 - %11 = extractelement <4 x float> %reg3, i32 3 - %12 = extractelement <4 x float> %reg4, i32 0 - %13 = extractelement <4 x float> %reg4, i32 1 - %14 = extractelement <4 x float> %reg4, i32 2 - %15 = extractelement <4 x float> %reg4, i32 3 - %16 = extractelement <4 x float> %reg5, i32 0 - %17 = extractelement <4 x float> %reg5, i32 1 - %18 = extractelement <4 x float> %reg5, i32 2 - %19 = extractelement <4 x float> %reg5, i32 3 - %20 = extractelement <4 x float> %reg6, i32 0 - %21 = extractelement <4 x float> %reg6, i32 1 - %22 = extractelement <4 x float> %reg6, i32 2 - %23 = extractelement <4 x float> %reg6, i32 3 - %24 = extractelement <4 x float> %reg7, i32 0 - %25 = extractelement <4 x float> %reg7, i32 1 - %26 = extractelement <4 x float> %reg7, i32 2 - %27 = extractelement <4 x float> %reg7, i32 3 - %28 = load <4 x float>, <4 x float> addrspace(8)* null - %29 = extractelement <4 x float> %28, i32 0 - %30 = fmul float %0, %29 - %31 = load <4 x float>, <4 x float> addrspace(8)* null - %32 = extractelement <4 x float> %31, i32 1 - %33 = fmul float %0, %32 - %34 = load <4 x float>, <4 x float> addrspace(8)* null - %35 = extractelement <4 x float> %34, i32 2 - %36 = fmul float %0, %35 - %37 = load <4 x float>, <4 x float> addrspace(8)* null - %38 = extractelement <4 x float> %37, i32 3 - %39 = fmul float %0, %38 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %41 = extractelement <4 x float> %40, i32 0 - %42 = fmul float %1, %41 - %43 = fadd float %42, %30 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %45 = extractelement <4 x float> %44, i32 1 - %46 = fmul float %1, %45 - %47 = fadd float %46, %33 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %49 = extractelement <4 x float> %48, i32 2 - %50 = fmul float %1, %49 - %51 = fadd float %50, %36 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %53 = extractelement <4 x float> %52, i32 3 - %54 = fmul float %1, %53 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* 
null, i64 0, i32 2) - %57 = extractelement <4 x float> %56, i32 0 - %58 = fmul float %2, %57 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %61 = extractelement <4 x float> %60, i32 1 - %62 = fmul float %2, %61 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %65 = extractelement <4 x float> %64, i32 2 - %66 = fmul float %2, %65 - %67 = fadd float %66, %51 - %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %69 = extractelement <4 x float> %68, i32 3 - %70 = fmul float %2, %69 - %71 = fadd float %70, %55 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %73 = extractelement <4 x float> %72, i32 0 - %74 = fmul float %3, %73 - %75 = fadd float %74, %59 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 1 - %78 = fmul float %3, %77 - %79 = fadd float %78, %63 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = fmul float %3, %81 - %83 = fadd float %82, %67 - %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %85 = extractelement <4 x float> %84, i32 3 - %86 = fmul float %3, %85 - %87 = fadd float %86, %71 - %88 = insertelement <4 x float> undef, float %4, i32 0 - %89 = insertelement <4 x float> %88, float %5, i32 1 - %90 = insertelement <4 x float> %89, float %6, i32 2 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3 - %92 = insertelement <4 x float> undef, float %4, i32 0 - %93 = insertelement <4 x float> %92, float %5, i32 1 - %94 = insertelement <4 x float> %93, float %6, i32 2 - %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 - %96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95) - %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq.f32(float %97) - %99 = fmul float %4, %98 - %100 = fmul float %5, %98 - %101 = fmul float %6, %98 - %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %103 = extractelement <4 x float> %102, i32 0 - %104 = fmul float %103, %8 - %105 = fadd float %104, %20 - %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %107 = extractelement <4 x float> %106, i32 1 - %108 = fmul float %107, %9 - %109 = fadd float %108, %21 - %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %111 = extractelement <4 x float> %110, i32 2 - %112 = fmul float %111, %10 - %113 = fadd float %112, %22 - %114 = call float @llvm.AMDIL.clamp.(float %105, float 0.000000e+00, float 1.000000e+00) - %115 = call float @llvm.AMDIL.clamp.(float %109, float 0.000000e+00, float 1.000000e+00) - %116 = call float @llvm.AMDIL.clamp.(float %113, float 0.000000e+00, 
float 1.000000e+00) - %117 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) - %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %119 = extractelement <4 x float> %118, i32 0 - %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %121 = extractelement <4 x float> %120, i32 1 - %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %123 = extractelement <4 x float> %122, i32 2 - %124 = insertelement <4 x float> undef, float %99, i32 0 - %125 = insertelement <4 x float> %124, float %100, i32 1 - %126 = insertelement <4 x float> %125, float %101, i32 2 - %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3 - %128 = insertelement <4 x float> undef, float %119, i32 0 - %129 = insertelement <4 x float> %128, float %121, i32 1 - %130 = insertelement <4 x float> %129, float %123, i32 2 - %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3 - %132 = call float @llvm.AMDGPU.dp4(<4 x float> %127, <4 x float> %131) - %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %134 = extractelement <4 x float> %133, i32 0 - %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %136 = extractelement <4 x float> %135, i32 1 - %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %138 = extractelement <4 x float> %137, i32 2 - %139 = insertelement <4 x float> undef, float %99, i32 0 - %140 = insertelement <4 x float> %139, float %100, i32 1 - %141 = insertelement <4 x float> %140, float %101, i32 2 - %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3 - %143 = insertelement <4 x float> undef, float %134, i32 0 - %144 = insertelement <4 x float> %143, float %136, i32 1 - %145 = insertelement <4 x float> %144, float %138, i32 2 - %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3 - %147 = call float @llvm.AMDGPU.dp4(<4 x float> %142, <4 x float> %146) - %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %149 = extractelement <4 x float> %148, i32 0 - %150 = fmul float %149, %8 - %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %152 = extractelement <4 x float> %151, i32 1 - %153 = fmul float %152, %9 - %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8) - %155 = extractelement <4 x float> %154, i32 2 - %156 = fmul float %155, %10 - %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %158 = extractelement <4 x float> %157, i32 0 - %159 = fmul float %158, %12 - %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %161 = extractelement <4 x float> %160, i32 1 - %162 = fmul float %161, %13 - %163 = load <4 x float>, <4 x float> 
addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) - %164 = extractelement <4 x float> %163, i32 2 - %165 = fmul float %164, %14 - %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %167 = extractelement <4 x float> %166, i32 0 - %168 = fmul float %167, %16 - %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %170 = extractelement <4 x float> %169, i32 1 - %171 = fmul float %170, %17 - %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10) - %173 = extractelement <4 x float> %172, i32 2 - %174 = fmul float %173, %18 - %175 = fcmp uge float %132, 0.000000e+00 - %176 = select i1 %175, float %132, float 0.000000e+00 - %177 = fcmp uge float %147, 0.000000e+00 - %178 = select i1 %177, float %147, float 0.000000e+00 - %179 = call float @llvm.pow.f32(float %178, float %24) - %180 = fcmp ult float %132, 0.000000e+00 - %181 = select i1 %180, float 0.000000e+00, float %179 - %182 = fadd float %150, %105 - %183 = fadd float %153, %109 - %184 = fadd float %156, %113 - %185 = fmul float %176, %159 - %186 = fadd float %185, %182 - %187 = fmul float %176, %162 - %188 = fadd float %187, %183 - %189 = fmul float %176, %165 - %190 = fadd float %189, %184 - %191 = fmul float %181, %168 - %192 = fadd float %191, %186 - %193 = fmul float %181, %171 - %194 = fadd float %193, %188 - %195 = fmul float %181, %174 - %196 = fadd float %195, %190 - %197 = call float @llvm.AMDIL.clamp.(float %192, float 0.000000e+00, float 1.000000e+00) - %198 = call float @llvm.AMDIL.clamp.(float %194, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDIL.clamp.(float %196, float 0.000000e+00, float 1.000000e+00) - %200 = insertelement <4 x float> undef, float %75, i32 0 - %201 = insertelement <4 x float> %200, float %79, i32 1 - %202 = insertelement <4 x float> %201, float %83, i32 2 - %203 = insertelement <4 x float> %202, float %87, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %203, i32 60, i32 1) - %204 = insertelement <4 x float> undef, float %197, i32 0 - %205 = insertelement <4 x float> %204, float %198, i32 1 - %206 = insertelement <4 x float> %205, float %199, i32 2 - %207 = insertelement <4 x float> %206, float %117, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %207, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll deleted file mode 100644 index 3a82ee30a32..00000000000 --- a/test/CodeGen/R600/r600-encoding.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG %s -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck 
--check-prefix=R600 %s - -; The earliest R600 GPUs have a slightly different encoding than the rest of -; the VLIW4/5 GPUs. - -; EG: {{^}}test: -; EG: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] - -; R600: {{^}}test: -; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] - -define void @test(<4 x float> inreg %reg0) #0 { -entry: - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = extractelement <4 x float> %reg0, i32 1 - %r2 = fmul float %r0, %r1 - %vec = insertelement <4 x float> undef, float %r2, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll deleted file mode 100644 index 7cb80195b36..00000000000 --- a/test/CodeGen/R600/r600-export-fix.ll +++ /dev/null @@ -1,142 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s - -;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0XYZ -;CHECK: EXPORT T{{[0-9]}}.XYZW -;CHECK: EXPORT T{{[0-9]}}.YZ00 -;CHECK: EXPORT T{{[0-9]}}.0000 -;CHECK: EXPORT T{{[0-9]}}.0000 - - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %5 = extractelement <4 x float> %4, i32 0 - %6 = fmul float %5, %0 - %7 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %8 = extractelement <4 x float> %7, i32 1 - %9 = fmul float %8, %0 - %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %11 = extractelement <4 x float> %10, i32 2 - %12 = fmul float %11, %0 - %13 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4) - %14 = extractelement <4 x float> %13, i32 3 - %15 = fmul float %14, %0 - %16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %17 = extractelement <4 x float> %16, i32 0 - %18 = fmul float %17, %1 - %19 = fadd float %18, %6 - %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %21 = extractelement <4 x float> %20, i32 1 - %22 = fmul float %21, %1 - %23 = fadd float %22, %9 - %24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %25 = extractelement <4 x float> %24, i32 2 - %26 = fmul float %25, %1 - %27 = fadd float %26, %12 - %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5) - %29 = extractelement <4 x float> %28, i32 3 - %30 = fmul float %29, %1 - %31 = fadd float %30, %15 - %32 = load <4 x float>, 
<4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %33 = extractelement <4 x float> %32, i32 0 - %34 = fmul float %33, %2 - %35 = fadd float %34, %19 - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %37 = extractelement <4 x float> %36, i32 1 - %38 = fmul float %37, %2 - %39 = fadd float %38, %23 - %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %41 = extractelement <4 x float> %40, i32 2 - %42 = fmul float %41, %2 - %43 = fadd float %42, %27 - %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6) - %45 = extractelement <4 x float> %44, i32 3 - %46 = fmul float %45, %2 - %47 = fadd float %46, %31 - %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %49 = extractelement <4 x float> %48, i32 0 - %50 = fmul float %49, %3 - %51 = fadd float %50, %35 - %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %53 = extractelement <4 x float> %52, i32 1 - %54 = fmul float %53, %3 - %55 = fadd float %54, %39 - %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %57 = extractelement <4 x float> %56, i32 2 - %58 = fmul float %57, %3 - %59 = fadd float %58, %43 - %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7) - %61 = extractelement <4 x float> %60, i32 3 - %62 = fmul float %61, %3 - %63 = fadd float %62, %47 - %64 = load <4 x float>, <4 x float> addrspace(8)* null - %65 = extractelement <4 x float> %64, i32 0 - %66 = load <4 x float>, <4 x float> addrspace(8)* null - %67 = extractelement <4 x float> %66, i32 1 - %68 = load <4 x float>, <4 x float> addrspace(8)* null - %69 = extractelement <4 x float> %68, i32 2 - %70 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %71 = extractelement <4 x float> %70, i32 0 - %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %73 = extractelement <4 x float> %72, i32 1 - %74 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %75 = extractelement <4 x float> %74, i32 2 - %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %77 = extractelement <4 x float> %76, i32 0 - %78 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %79 = extractelement <4 x float> %78, i32 1 - %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3) - %81 = extractelement <4 x float> %80, i32 2 - %82 = insertelement <4 x float> undef, float %51, i32 0 - %83 = insertelement <4 x float> %82, float %55, i32 1 - %84 = insertelement <4 x float> %83, float %59, i32 2 - 
%85 = insertelement <4 x float> %84, float %63, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1) - %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1 - %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2 - %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2) - %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1 - %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2 - %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2) - %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %95 = insertelement <4 x float> %94, float %65, i32 1 - %96 = insertelement <4 x float> %95, float %67, i32 2 - %97 = insertelement <4 x float> %96, float %69, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2) - %98 = insertelement <4 x float> undef, float %77, i32 0 - %99 = insertelement <4 x float> %98, float %79, i32 1 - %100 = insertelement <4 x float> %99, float %81, i32 2 - %101 = insertelement <4 x float> %100, float %71, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2) - %102 = insertelement <4 x float> undef, float %73, i32 0 - %103 = insertelement <4 x float> %102, float %75, i32 1 - %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2 - %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2) - %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1 - %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2 - %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2) - %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1 - %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2 - %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll deleted file mode 100644 index f388f8ffe29..00000000000 --- a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll +++ /dev/null @@ -1,58 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman - -define void @main(<4 x float> inreg, <4 x float> inreg) #0 { -main_body: - %2 = extractelement <4 x float> %0, i32 0 - %3 = extractelement <4 x float> %0, i32 1 - %4 = extractelement <4 x float> %0, i32 2 - %5 = extractelement <4 x float> %0, i32 3 - %6 = insertelement <4 x float> undef, float %2, i32 0 - %7 = insertelement <4 x float> %6, float %3, i32 1 - %8 = insertelement <4 x float> %7, float %4, i32 2 - %9 = insertelement <4 x float> %8, float %5, i32 3 - %10 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %9) - %11 = extractelement <4 x float> %10, i32 0 - %12 = extractelement <4 x float> %10, i32 1 - %13 = extractelement <4 x float> %10, i32 2 - %14 = extractelement <4 x 
float> %10, i32 3 - %15 = call float @fabs(float %13) - %16 = fdiv float 1.000000e+00, %15 - %17 = fmul float %11, %16 - %18 = fadd float %17, 1.500000e+00 - %19 = fmul float %12, %16 - %20 = fadd float %19, 1.500000e+00 - %21 = insertelement <4 x float> undef, float %20, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %14, i32 2 - %24 = insertelement <4 x float> %23, float %5, i32 3 - %25 = extractelement <4 x float> %24, i32 0 - %26 = extractelement <4 x float> %24, i32 1 - %27 = extractelement <4 x float> %24, i32 2 - %28 = extractelement <4 x float> %24, i32 3 - %29 = insertelement <4 x float> undef, float %25, i32 0 - %30 = insertelement <4 x float> %29, float %26, i32 1 - %31 = insertelement <4 x float> %30, float %27, i32 2 - %32 = insertelement <4 x float> %31, float %28, i32 3 - %33 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %32, i32 16, i32 0, i32 13) - %34 = extractelement <4 x float> %33, i32 0 - %35 = insertelement <4 x float> undef, float %34, i32 0 - %36 = insertelement <4 x float> %35, float %34, i32 1 - %37 = insertelement <4 x float> %36, float %34, i32 2 - %38 = insertelement <4 x float> %37, float 1.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %38, i32 0, i32 0) - ret void -} - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 - -; Function Attrs: readnone -declare float @fabs(float) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll deleted file mode 100644 index c7b9d65220f..00000000000 --- a/test/CodeGen/R600/r600cfg.ll +++ /dev/null @@ -1,119 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = bitcast float %0 to i32 - %5 = icmp eq i32 %4, 0 - %6 = sext i1 %5 to i32 - %7 = bitcast i32 %6 to float - %8 = bitcast float %7 to i32 - %9 = icmp ne i32 %8, 0 - %. 
= select i1 %9, float 0x36A0000000000000, float %0 - br label %LOOP - -LOOP: ; preds = %LOOP47, %main_body - %temp12.0 = phi float [ 0x36A0000000000000, %main_body ], [ %temp12.1, %LOOP47 ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %38, %LOOP47 ] - %temp4.1 = phi float [ %., %main_body ], [ %52, %LOOP47 ] - %10 = bitcast float %temp4.1 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF41, label %ENDIF40 - -IF41: ; preds = %LOOP - %16 = insertelement <4 x float> undef, float %0, i32 0 - %17 = insertelement <4 x float> %16, float %temp8.0, i32 1 - %18 = insertelement <4 x float> %17, float %temp12.0, i32 2 - %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %19, i32 0, i32 0, i32 1) - %20 = insertelement <4 x float> undef, float %0, i32 0 - %21 = insertelement <4 x float> %20, float %temp8.0, i32 1 - %22 = insertelement <4 x float> %21, float %temp12.0, i32 2 - %23 = insertelement <4 x float> %22, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %23, i32 0, i32 0, i32 2) - %24 = insertelement <4 x float> undef, float %0, i32 0 - %25 = insertelement <4 x float> %24, float %temp8.0, i32 1 - %26 = insertelement <4 x float> %25, float %temp12.0, i32 2 - %27 = insertelement <4 x float> %26, float 0.000000e+00, i32 3 - call void @llvm.R600.store.stream.output(<4 x float> %27, i32 0, i32 0, i32 4) - %28 = insertelement <4 x float> undef, float 0.000000e+00, i32 0 - %29 = insertelement <4 x float> %28, float 0.000000e+00, i32 1 - %30 = insertelement <4 x float> %29, float 0.000000e+00, i32 2 - %31 = insertelement <4 x float> %30, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %31, i32 60, i32 1) - %32 = insertelement <4 x float> undef, float %0, i32 0 - %33 = insertelement <4 x float> %32, float %temp8.0, i32 1 - %34 = insertelement <4 x float> %33, float %temp12.0, i32 2 - %35 = insertelement <4 x float> %34, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %35, i32 0, i32 2) - ret void - -ENDIF40: ; preds = %LOOP - %36 = bitcast float %temp8.0 to i32 - %37 = add i32 %36, 1 - %38 = bitcast i32 %37 to float - %39 = bitcast float %temp4.1 to i32 - %40 = urem i32 %39, 2 - %41 = bitcast i32 %40 to float - %42 = bitcast float %41 to i32 - %43 = icmp eq i32 %42, 0 - %44 = sext i1 %43 to i32 - %45 = bitcast i32 %44 to float - %46 = bitcast float %45 to i32 - %47 = icmp ne i32 %46, 0 - %48 = bitcast float %temp4.1 to i32 - br i1 %47, label %IF44, label %ELSE45 - -IF44: ; preds = %ENDIF40 - %49 = udiv i32 %48, 2 - br label %ENDIF43 - -ELSE45: ; preds = %ENDIF40 - %50 = mul i32 3, %48 - %51 = add i32 %50, 1 - br label %ENDIF43 - -ENDIF43: ; preds = %ELSE45, %IF44 - %.sink = phi i32 [ %49, %IF44 ], [ %51, %ELSE45 ] - %52 = bitcast i32 %.sink to float - %53 = load <4 x float>, <4 x float> addrspace(8)* null - %54 = extractelement <4 x float> %53, i32 0 - %55 = bitcast float %54 to i32 - br label %LOOP47 - -LOOP47: ; preds = %ENDIF48, %ENDIF43 - %temp12.1 = phi float [ %temp12.0, %ENDIF43 ], [ %67, %ENDIF48 ] - %temp28.0 = phi float [ 0.000000e+00, %ENDIF43 ], [ %70, %ENDIF48 ] - %56 = bitcast float %temp28.0 to i32 - %57 = icmp uge i32 %56, %55 - %58 = sext i1 %57 to i32 - %59 = bitcast i32 %58 to float - %60 = bitcast float %59 to i32 - %61 = icmp ne i32 %60, 0 - br i1 %61, label %LOOP, label %ENDIF48 - -ENDIF48: ; preds 
= %LOOP47 - %62 = bitcast float %temp12.1 to i32 - %63 = mul i32 %62, 2 - %64 = bitcast i32 %63 to float - %65 = bitcast float %64 to i32 - %66 = urem i32 %65, 2147483647 - %67 = bitcast i32 %66 to float - %68 = bitcast float %temp28.0 to i32 - %69 = add i32 %68, 1 - %70 = bitcast i32 %69 to float - br label %LOOP47 -} - -declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll deleted file mode 100644 index b4ac47afced..00000000000 --- a/test/CodeGen/R600/reciprocal.ll +++ /dev/null @@ -1,15 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test(<4 x float> inreg %reg0) #0 { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll deleted file mode 100644 index de6bfb31088..00000000000 --- a/test/CodeGen/R600/register-count-comments.ll +++ /dev/null @@ -1,27 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.SI.tid() nounwind readnone - -; SI-LABEL: {{^}}foo: -; SI: .section .AMDGPU.csdata -; SI: ; Kernel info: -; SI: ; NumSgprs: {{[0-9]+}} -; SI: ; NumVgprs: {{[0-9]+}} -define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { - %tid = call i32 @llvm.SI.tid() nounwind readnone - %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid - %bptr = getelementptr i32, i32 addrspace(1)* %bbase, i32 %tid - %outptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %result = add i32 %a, %b - store i32 %result, i32 addrspace(1)* %outptr, align 4 - ret void -} - -; SI-LABEL: {{^}}one_vgpr_used: -; SI: NumVgprs: 1 -define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { - store i32 %x, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll deleted file mode 100644 index 187650ff9a5..00000000000 --- a/test/CodeGen/R600/reorder-stores.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store: -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_load_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16 - ret void 
-} - -; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI: ds_write_b64 -; SI: s_endpgm -define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 - %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 - store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 - store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16 - ret void -} - -; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: s_endpgm -define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { - %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 - %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 - store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 - store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32 - ret void -} - -; SI-LABEL: {{^}}no_reorder_extload_64: -; SI: ds_read_b64 -; SI: ds_read_b64 -; SI: ds_write_b64 -; SI-NOT: ds_read -; SI: ds_write_b64 -; SI: s_endpgm -define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { - %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 - %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 - %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> - %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64> - %tmp7 = add <2 x i64> %tmp1ext, - %tmp9 = add <2 x i64> %tmp4ext, - %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32> - %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32> - store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8 - store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8 - ret void -} diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll deleted file mode 100644 index 3f4ceb7e031..00000000000 --- a/test/CodeGen/R600/rotl.i64.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}s_rotl_i64: -; BOTH-DAG: s_lshl_b64 -; BOTH-DAG: s_sub_i32 -; BOTH-DAG: s_lshr_b64 -; BOTH: s_or_b64 -; BOTH: s_endpgm -define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { -entry: - %0 = shl i64 %x, %y - %1 = sub i64 64, %y - %2 = lshr i64 %x, %1 - %3 = or i64 %0, %2 - store i64 %3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotl_i64: -; 
SI-DAG: v_lshl_b64 -; VI-DAG: v_lshlrev_b64 -; BOTH-DAG: v_sub_i32 -; SI: v_lshr_b64 -; VI: v_lshrrev_b64 -; BOTH: v_or_b32 -; BOTH: v_or_b32 -; BOTH: s_endpgm -define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { -entry: - %x = load i64, i64 addrspace(1)* %xptr, align 8 - %y = load i64, i64 addrspace(1)* %yptr, align 8 - %tmp0 = shl i64 %x, %y - %tmp1 = sub i64 64, %y - %tmp2 = lshr i64 %x, %tmp1 - %tmp3 = or i64 %tmp0, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in, align 8 - ret void -} diff --git a/test/CodeGen/R600/rotl.ll b/test/CodeGen/R600/rotl.ll deleted file mode 100644 index 6c144cd56ea..00000000000 --- a/test/CodeGen/R600/rotl.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rotl_i32: -; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x -; R600-NEXT: 32 -; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}} - -; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} -; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] -; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] -define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { -entry: - %0 = shl i32 %x, %y - %1 = sub i32 32, %y - %2 = lshr i32 %x, %1 - %3 = or i32 %0, %2 - store i32 %3, i32 addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotl_v2i32: -; SI-DAG: s_sub_i32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: v_alignbit_b32 -; SI: s_endpgm -define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { -entry: - %0 = shl <2 x i32> %x, %y - %1 = sub <2 x i32> , %y - %2 = lshr <2 x i32> %x, %1 - %3 = or <2 x i32> %0, %2 - store <2 x i32> %3, <2 x i32> addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotl_v4i32: -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI-DAG: s_sub_i32 -; SI-DAG: v_alignbit_b32 -; SI: s_endpgm -define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { -entry: - %0 = shl <4 x i32> %x, %y - %1 = sub <4 x i32> , %y - %2 = lshr <4 x i32> %x, %1 - %3 = or <4 x i32> %0, %2 - store <4 x i32> %3, <4 x i32> addrspace(1)* %in - ret void -} diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll deleted file mode 100644 index 586de44a566..00000000000 --- a/test/CodeGen/R600/rotr.i64.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s - -; BOTH-LABEL: {{^}}s_rotr_i64: -; BOTH-DAG: s_sub_i32 -; BOTH-DAG: s_lshr_b64 -; BOTH-DAG: s_lshl_b64 -; BOTH: s_or_b64 -define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { -entry: - %tmp0 = sub i64 64, %y - %tmp1 = shl i64 %x, %tmp0 - %tmp2 = lshr i64 %x, %y - %tmp3 = or i64 %tmp1, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotr_i64: -; BOTH-DAG: v_sub_i32 -; SI-DAG: v_lshr_b64 -; SI-DAG: v_lshl_b64 -; VI-DAG: v_lshrrev_b64 -; VI-DAG: v_lshlrev_b64 -; BOTH: v_or_b32 -; BOTH: v_or_b32 -define void @v_rotr_i64(i64 addrspace(1)* %in, i64 
addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { -entry: - %x = load i64, i64 addrspace(1)* %xptr, align 8 - %y = load i64, i64 addrspace(1)* %yptr, align 8 - %tmp0 = sub i64 64, %y - %tmp1 = shl i64 %x, %tmp0 - %tmp2 = lshr i64 %x, %y - %tmp3 = or i64 %tmp1, %tmp2 - store i64 %tmp3, i64 addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}s_rotr_v2i64: -define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { -entry: - %tmp0 = sub <2 x i64> , %y - %tmp1 = shl <2 x i64> %x, %tmp0 - %tmp2 = lshr <2 x i64> %x, %y - %tmp3 = or <2 x i64> %tmp1, %tmp2 - store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in - ret void -} - -; BOTH-LABEL: {{^}}v_rotr_v2i64: -define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { -entry: - %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 - %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 - %tmp0 = sub <2 x i64> , %y - %tmp1 = shl <2 x i64> %x, %tmp0 - %tmp2 = lshr <2 x i64> %x, %y - %tmp3 = or <2 x i64> %tmp1, %tmp2 - store <2 x i64> %tmp3, <2 x i64> addrspace(1)* %in - ret void -} diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll deleted file mode 100644 index 044f9ffe6d6..00000000000 --- a/test/CodeGen/R600/rotr.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}rotr_i32: -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { -entry: - %tmp0 = sub i32 32, %y - %tmp1 = shl i32 %x, %tmp0 - %tmp2 = lshr i32 %x, %y - %tmp3 = or i32 %tmp1, %tmp2 - store i32 %tmp3, i32 addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotr_v2i32: -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { -entry: - %tmp0 = sub <2 x i32> , %y - %tmp1 = shl <2 x i32> %x, %tmp0 - %tmp2 = lshr <2 x i32> %x, %y - %tmp3 = or <2 x i32> %tmp1, %tmp2 - store <2 x i32> %tmp3, <2 x i32> addrspace(1)* %in - ret void -} - -; FUNC-LABEL: {{^}}rotr_v4i32: -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT -; R600: BIT_ALIGN_INT - -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -; SI: v_alignbit_b32 -define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { -entry: - %tmp0 = sub <4 x i32> , %y - %tmp1 = shl <4 x i32> %x, %tmp0 - %tmp2 = lshr <4 x i32> %x, %y - %tmp3 = or <4 x i32> %tmp1, %tmp2 - store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %in - ret void -} diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll deleted file mode 100644 index b67b800c737..00000000000 --- a/test/CodeGen/R600/rsq.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare float @llvm.sqrt.f32(float) nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - -; SI-LABEL: 
{{^}}rsq_f32: -; SI: v_rsq_f32_e32 -; SI: s_endpgm -define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { - %val = load float, float addrspace(1)* %in, align 4 - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt - store float %div, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f64: -; SI-UNSAFE: v_rsq_f64_e32 -; SI-SAFE: v_sqrt_f64_e32 -; SI: s_endpgm -define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { - %val = load double, double addrspace(1)* %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone - %div = fdiv double 1.0, %sqrt - store double %div, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}rsq_f32_sgpr: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; SI: s_endpgm -define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { - %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv float 1.0, %sqrt - store float %div, float addrspace(1)* %out, align 4 - ret void -} - -; Recognize that this is rsqrt(a) * rcp(b) * c, -; not 1 / ( 1 / sqrt(a)) * rcp(b) * c. - -; SI-LABEL: @rsqrt_fmul -; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 - -; SI-UNSAFE-DAG: v_rsq_f32_e32 [[RSQA:v[0-9]+]], [[A]] -; SI-UNSAFE-DAG: v_rcp_f32_e32 [[RCPB:v[0-9]+]], [[B]] -; SI-UNSAFE-DAG: v_mul_f32_e32 [[TMP:v[0-9]+]], [[RCPB]], [[RSQA]] -; SI-UNSAFE: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] -; SI-UNSAFE: buffer_store_dword [[RESULT]] - -; SI-SAFE-NOT: v_rsq_f32 - -; SI: s_endpgm -define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid - %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid - %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 - %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2 - - %a = load float, float addrspace(1)* %gep.0 - %b = load float, float addrspace(1)* %gep.1 - %c = load float, float addrspace(1)* %gep.2 - - %x = call float @llvm.sqrt.f32(float %a) - %y = fmul float %x, %b - %z = fdiv float %c, %y - store float %z, float addrspace(1)* %out.gep - ret void -} diff --git a/test/CodeGen/R600/rv7x0_count3.ll b/test/CodeGen/R600/rv7x0_count3.ll deleted file mode 100644 index c3fd923e459..00000000000 --- a/test/CodeGen/R600/rv7x0_count3.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s - -; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] - -define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { - %1 = extractelement <4 x float> %reg1, i32 0 - %2 = extractelement <4 x float> %reg1, i32 1 - %3 = extractelement <4 x float> %reg1, i32 2 - %4 = extractelement <4 x float> %reg1, i32 3 - %5 = insertelement <4 x float> undef, float %1, i32 0 - %6 = insertelement <4 x float> %5, float %2, i32 1 - %7 = insertelement <4 x float> %6, float %3, i32 2 - %8 = insertelement <4 x float> %7, float %4, i32 3 - %9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 0, i32 0, i32 1) - %10 = call 
<4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 1, i32 0, i32 1) - %11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 2, i32 0, i32 1) - %12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 3, i32 0, i32 1) - %13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 4, i32 0, i32 1) - %14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 5, i32 0, i32 1) - %15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 6, i32 0, i32 1) - %16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 7, i32 0, i32 1) - %17 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 8, i32 0, i32 1) - %18 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %8, i32 9, i32 0, i32 1) - %19 = fadd <4 x float> %9, %10 - %20 = fadd <4 x float> %19, %11 - %21 = fadd <4 x float> %20, %12 - %22 = fadd <4 x float> %21, %13 - %23 = fadd <4 x float> %22, %14 - %24 = fadd <4 x float> %23, %15 - %25 = fadd <4 x float> %24, %16 - %26 = fadd <4 x float> %25, %17 - %27 = fadd <4 x float> %26, %18 - call void @llvm.R600.store.swizzle(<4 x float> %27, i32 0, i32 2) - ret void -} - -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/s_movk_i32.ll b/test/CodeGen/R600/s_movk_i32.ll deleted file mode 100644 index 6b1a36c979c..00000000000 --- a/test/CodeGen/R600/s_movk_i32.ll +++ /dev/null @@ -1,185 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_movk_i32_k0: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k1: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k2: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) - store i64 %or, i64 addrspace(1)* 
%out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k3: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k4: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k5: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k6: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k7: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}} -; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) - store i64 %or, i64 addrspace(1)* %out - ret void -} - - -; SI-LABEL: {{^}}s_movk_i32_k8: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], 
v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k9: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k10: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k11: -; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff - store i64 %or, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_movk_i32_k12: -; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}} -; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}} -; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] -; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] -; SI: s_endpgm -define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { - %loada = load i64, i64 addrspace(1)* %a, align 4 - %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 - store i64 %or, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll deleted file mode 100644 index f8ced7942a6..00000000000 --- a/test/CodeGen/R600/saddo.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s - -declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: 
{{^}}saddo_i64_zext: -define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_saddo_i32: -define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %sadd, 0 - %carry = extractvalue { i32, i1 } %sadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_saddo_i32: -define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %sadd, 0 - %carry = extractvalue { i32, i1 } %sadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_saddo_i64: -define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_saddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %sadd, 0 - %carry = extractvalue { i64, i1 } %sadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll deleted file mode 100644 index 0b964957654..00000000000 --- a/test/CodeGen/R600/salu-to-valu.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s - -; In this test both the pointer and the offset operands to the -; BUFFER_LOAD instructions end up being stored in vgprs. This -; requires us to add the pointer and offset together, store the -; result in the offset operand (vaddr), and then store 0 in an -; sgpr register pair and use that for the pointer operand -; (low 64-bits of srsrc). 
- -; CHECK-LABEL: {{^}}mubuf: - -; Make sure we aren't using VGPRs for the source operand of s_mov_b64 -; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v - -; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_* -; instructions -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() #1 - %1 = call i32 @llvm.r600.read.tidig.y() #1 - %2 = sext i32 %0 to i64 - %3 = sext i32 %1 to i64 - br label %loop - -loop: - %4 = phi i64 [0, %entry], [%5, %loop] - %5 = add i64 %2, %4 - %6 = getelementptr i8, i8 addrspace(1)* %in, i64 %5 - %7 = load i8, i8 addrspace(1)* %6, align 1 - %8 = or i64 %5, 1 - %9 = getelementptr i8, i8 addrspace(1)* %in, i64 %8 - %10 = load i8, i8 addrspace(1)* %9, align 1 - %11 = add i8 %7, %10 - %12 = sext i8 %11 to i32 - store i32 %12, i32 addrspace(1)* %out - %13 = icmp slt i64 %5, 10 - br i1 %13, label %loop, label %done - -done: - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #1 -declare i32 @llvm.r600.read.tidig.y() #1 - -attributes #1 = { nounwind readnone } - -; Test moving an SMRD instruction to the VALU - -; CHECK-LABEL: {{^}}smrd_valu: -; CHECK: buffer_load_dword [[OUT:v[0-9]+]] -; CHECK: buffer_store_dword [[OUT]] - -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) { -entry: - %0 = icmp ne i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - br label %endif - -else: - %2 = getelementptr i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %in - %3 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(1)* %2 - br label %endif - -endif: - %4 = phi i32 addrspace(2)* [%1, %if], [%3, %else] - %5 = getelementptr i32, i32 addrspace(2)* %4, i32 3000 - %6 = load i32, i32 addrspace(2)* %5 - store i32 %6, i32 addrspace(1)* %out - ret void -} - -; Test moving ann SMRD with an immediate offset to the VALU - -; CHECK-LABEL: {{^}}smrd_valu2: -; CHECK: buffer_load_dword -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %1 = add i32 %0, 4 - %2 = getelementptr [8 x i32], [8 x i32] addrspace(2)* %in, i32 %0, i32 4 - %3 = load i32, i32 addrspace(2)* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}s_load_imm_v8i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { -entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)* - %tmp3 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp2, align 4 - store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32 - ret void -} - -; CHECK-LABEL: {{^}}s_load_imm_v16i32: -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -; CHECK: buffer_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) { -entry: - %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1 - %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 - %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)* - %tmp3 = 
load <16 x i32>, <16 x i32> addrspace(2)* %tmp2, align 4 - store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32 - ret void -} diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll deleted file mode 100644 index 0970e5d3063..00000000000 --- a/test/CodeGen/R600/scalar_to_vector.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}scalar_to_vector_v2i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm -define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %tmp1 = load i32, i32 addrspace(1)* %in, align 4 - %bc = bitcast i32 %tmp1 to <2 x i16> - %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> - store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}scalar_to_vector_v2f32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: buffer_store_short [[RESULT]] -; SI: s_endpgm -define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { - %tmp1 = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %tmp1 to <2 x i16> - %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> - store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8 - ret void -} - -; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed -; to produce one, but for some reason never made it to selection.
- - -; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { -; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 -; %bc = bitcast i32 %tmp1 to <4 x i8> - -; %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> -; store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4 -; ret void -; } - -; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 -; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 -; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> -; %add = add <4 x i32> %bc, -; store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> -; %add = add <8 x i16> %bc, -; store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } - -; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } diff --git a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll deleted file mode 100644 index 11e8f5176f4..00000000000 --- a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll +++ /dev/null @@ -1,82 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %1, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = fcmp ult float %0, 5.700000e+01 - %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00 - %11 = fsub float -0.000000e+00, %10 - %12 = fptosi float %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %8 to i32 - %15 = bitcast float %13 to i32 - %16 = and i32 %14, %15 - %17 = bitcast i32 %16 to float - %18 = bitcast float %17 to i32 - %19 = icmp ne i32 %18, 0 - %20 = fcmp ult float %0, 0.000000e+00 - %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00 - %22 = fsub float -0.000000e+00, %21 - %23 = fptosi float %22 to i32 - %24 = bitcast i32 %23 to float - %25 = bitcast float %24 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %19, label %IF, label %ELSE - -IF: ; preds = %main_body - %. 
= select i1 %26, float 0.000000e+00, float 1.000000e+00 - %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00 - br label %ENDIF - -ELSE: ; preds = %main_body - br i1 %26, label %ENDIF, label %ELSE17 - -ENDIF: ; preds = %ELSE17, %ELSE, %IF - %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ] - %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ] - %27 = call float @llvm.AMDIL.clamp.(float %temp.0, float 0.000000e+00, float 1.000000e+00) - %28 = call float @llvm.AMDIL.clamp.(float %temp1.0, float 0.000000e+00, float 1.000000e+00) - %29 = call float @llvm.AMDIL.clamp.(float %temp2.0, float 0.000000e+00, float 1.000000e+00) - %30 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %31 = insertelement <4 x float> undef, float %27, i32 0 - %32 = insertelement <4 x float> %31, float %28, i32 1 - %33 = insertelement <4 x float> %32, float %29, i32 2 - %34 = insertelement <4 x float> %33, float %30, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %34, i32 0, i32 0) - ret void - -ELSE17: ; preds = %ELSE - %35 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %36 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %37 = fadd float 0.000000e+00, 0x3FC99999A0000000 - %38 = fadd float %35, 0x3FC99999A0000000 - %39 = fadd float %36, 0x3FC99999A0000000 - %40 = fadd float %37, 0x3FC99999A0000000 - %41 = fadd float %38, 0x3FC99999A0000000 - %42 = fadd float %39, 0x3FC99999A0000000 - %43 = fadd float %40, 0x3FC99999A0000000 - %44 = fadd float %41, 0x3FC99999A0000000 - %45 = fadd float %42, 0x3FC99999A0000000 - %46 = fadd float %43, 0x3FC99999A0000000 - %47 = fadd float %44, 0x3FC99999A0000000 - %48 = fadd float %45, 0x3FC99999A0000000 - %49 = fadd float %46, 0x3FC99999A0000000 - br label %ENDIF -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } -attributes #1 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/schedule-fs-loop-nested.ll b/test/CodeGen/R600/schedule-fs-loop-nested.ll deleted file mode 100644 index 759197ca61f..00000000000 --- a/test/CodeGen/R600/schedule-fs-loop-nested.ll +++ /dev/null @@ -1,88 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = bitcast float %3 to i32 - %5 = sdiv i32 %4, 4 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = mul i32 %7, 4 - %9 = bitcast i32 %8 to float - %10 = bitcast float %9 to i32 - %11 = sub i32 0, %10 - %12 = bitcast i32 %11 to float - %13 = bitcast float %3 to i32 - %14 = bitcast float %12 to i32 - %15 = add i32 %13, %14 - %16 = bitcast i32 %15 to float - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = load <4 x float>, <4 x float> addrspace(9)* null - %20 = extractelement <4 x float> %19, i32 1 - %21 = load <4 x float>, <4 x float> addrspace(9)* null - %22 = extractelement <4 x float> %21, i32 2 - br label %LOOP - -LOOP: ; preds = %IF31, %main_body - %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ] - %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ] - %temp5.0 = phi float [ %20, 
%main_body ], [ %temp5.1, %IF31 ] - %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ] - %23 = bitcast float %temp12.0 to i32 - %24 = bitcast float %6 to i32 - %25 = icmp sge i32 %23, %24 - %26 = sext i1 %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - %29 = icmp ne i32 %28, 0 - br i1 %29, label %IF, label %LOOP29 - -IF: ; preds = %LOOP - %30 = call float @llvm.AMDIL.clamp.(float %temp4.0, float 0.000000e+00, float 1.000000e+00) - %31 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %32 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %33 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %34 = insertelement <4 x float> undef, float %30, i32 0 - %35 = insertelement <4 x float> %34, float %31, i32 1 - %36 = insertelement <4 x float> %35, float %32, i32 2 - %37 = insertelement <4 x float> %36, float %33, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %37, i32 0, i32 0) - ret void - -LOOP29: ; preds = %LOOP, %ENDIF30 - %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ] - %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ] - %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ] - %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ] - %38 = bitcast float %temp20.0 to i32 - %39 = bitcast float %16 to i32 - %40 = icmp sge i32 %38, %39 - %41 = sext i1 %40 to i32 - %42 = bitcast i32 %41 to float - %43 = bitcast float %42 to i32 - %44 = icmp ne i32 %43, 0 - br i1 %44, label %IF31, label %ENDIF30 - -IF31: ; preds = %LOOP29 - %45 = bitcast float %temp12.0 to i32 - %46 = add i32 %45, 1 - %47 = bitcast i32 %46 to float - br label %LOOP - -ENDIF30: ; preds = %LOOP29 - %48 = bitcast float %temp20.0 to i32 - %49 = add i32 %48, 1 - %50 = bitcast i32 %49 to float - br label %LOOP29 -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/schedule-fs-loop.ll b/test/CodeGen/R600/schedule-fs-loop.ll deleted file mode 100644 index 28cc08abc02..00000000000 --- a/test/CodeGen/R600/schedule-fs-loop.ll +++ /dev/null @@ -1,55 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(9)* null - %1 = extractelement <4 x float> %0, i32 3 - %2 = fptosi float %1 to i32 - %3 = bitcast i32 %2 to float - %4 = load <4 x float>, <4 x float> addrspace(9)* null - %5 = extractelement <4 x float> %4, i32 0 - %6 = load <4 x float>, <4 x float> addrspace(9)* null - %7 = extractelement <4 x float> %6, i32 1 - %8 = load <4 x float>, <4 x float> addrspace(9)* null - %9 = extractelement <4 x float> %8, i32 2 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ] - %10 = bitcast float %temp8.0 to i32 - %11 = bitcast float %3 to i32 - %12 = icmp sge i32 %10, %11 - %13 = sext i1 %12 to i32 - %14 = bitcast i32 %13 to float - %15 = bitcast float %14 to i32 - %16 = icmp ne i32 %15, 0 - br i1 %16, label %IF, label %ENDIF - -IF: ; preds = %LOOP - %17 = call float @llvm.AMDIL.clamp.(float 
%temp4.0, float 0.000000e+00, float 1.000000e+00) - %18 = call float @llvm.AMDIL.clamp.(float %temp5.0, float 0.000000e+00, float 1.000000e+00) - %19 = call float @llvm.AMDIL.clamp.(float %temp6.0, float 0.000000e+00, float 1.000000e+00) - %20 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %21 = insertelement <4 x float> undef, float %17, i32 0 - %22 = insertelement <4 x float> %21, float %18, i32 1 - %23 = insertelement <4 x float> %22, float %19, i32 2 - %24 = insertelement <4 x float> %23, float %20, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %24, i32 0, i32 0) - ret void - -ENDIF: ; preds = %LOOP - %25 = bitcast float %temp8.0 to i32 - %26 = add i32 %25, 1 - %27 = bitcast i32 %26 to float - br label %LOOP -} - -declare float @llvm.AMDIL.clamp.(float, float, float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll deleted file mode 100644 index 3f728fd873b..00000000000 --- a/test/CodeGen/R600/schedule-global-loads.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s - - -declare i32 @llvm.r600.read.tidig.x() #1 - -; FIXME: This currently doesn't do a great job of clustering the -; loads, which end up with extra moves between them. Right now, it -; seems the only thing areLoadsFromSameBasePtr is accomplishing is -; ordering the loads so that the lower address loads come first. - -; FUNC-LABEL: {{^}}cluster_global_arg_loads: -; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 -; SI: buffer_store_dword [[REG0]] -; SI: buffer_store_dword [[REG1]] -define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { - %load0 = load i32, i32 addrspace(1)* %ptr, align 4 - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 1 - %load1 = load i32, i32 addrspace(1)* %gep, align 4 - store i32 %load0, i32 addrspace(1)* %out0, align 4 - store i32 %load1, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking -; an MUBUF load which does not have a vaddr operand.
-; FUNC-LABEL: {{^}}same_base_ptr_crash: -; SI: buffer_load_dword -; SI: buffer_load_dword -define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { -entry: - %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset - %tmp0 = load i32, i32 addrspace(1)* %out - %tmp1 = load i32, i32 addrspace(1)* %out1 - %tmp2 = add i32 %tmp0, %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/schedule-if-2.ll b/test/CodeGen/R600/schedule-if-2.ll deleted file mode 100644 index 54946509683..00000000000 --- a/test/CodeGen/R600/schedule-if-2.ll +++ /dev/null @@ -1,94 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) - %1 = extractelement <4 x float> %0, i32 0 - %2 = fadd float 1.000000e+03, %1 - %3 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %4 = extractelement <4 x float> %3, i32 0 - %5 = bitcast float %4 to i32 - %6 = icmp eq i32 %5, 0 - %7 = sext i1 %6 to i32 - %8 = bitcast i32 %7 to float - %9 = bitcast float %8 to i32 - %10 = icmp ne i32 %9, 0 - br i1 %10, label %IF, label %ELSE - -IF: ; preds = %main_body - %11 = call float @fabs(float %2) - %12 = fcmp ueq float %11, 0x7FF0000000000000 - %13 = select i1 %12, float 1.000000e+00, float 0.000000e+00 - %14 = fsub float -0.000000e+00, %13 - %15 = fptosi float %14 to i32 - %16 = bitcast i32 %15 to float - %17 = bitcast float %16 to i32 - %18 = icmp ne i32 %17, 0 - %. = select i1 %18, float 0x36A0000000000000, float 0.000000e+00 - %19 = fcmp une float %2, %2 - %20 = select i1 %19, float 1.000000e+00, float 0.000000e+00 - %21 = fsub float -0.000000e+00, %20 - %22 = fptosi float %21 to i32 - %23 = bitcast i32 %22 to float - %24 = bitcast float %23 to i32 - %25 = icmp ne i32 %24, 0 - %temp8.0 = select i1 %25, float 0x36A0000000000000, float 0.000000e+00 - %26 = bitcast float %. 
to i32 - %27 = sitofp i32 %26 to float - %28 = bitcast float %temp8.0 to i32 - %29 = sitofp i32 %28 to float - %30 = fcmp ugt float %2, 0.000000e+00 - %31 = select i1 %30, float 1.000000e+00, float %2 - %32 = fcmp uge float %31, 0.000000e+00 - %33 = select i1 %32, float %31, float -1.000000e+00 - %34 = fadd float %33, 1.000000e+00 - %35 = fmul float %34, 5.000000e-01 - br label %ENDIF - -ELSE: ; preds = %main_body - %36 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %37 = extractelement <4 x float> %36, i32 0 - %38 = bitcast float %37 to i32 - %39 = icmp eq i32 %38, 1 - %40 = sext i1 %39 to i32 - %41 = bitcast i32 %40 to float - %42 = bitcast float %41 to i32 - %43 = icmp ne i32 %42, 0 - br i1 %43, label %IF23, label %ENDIF - -ENDIF: ; preds = %IF23, %ELSE, %IF - %temp4.0 = phi float [ %2, %IF ], [ %56, %IF23 ], [ 0.000000e+00, %ELSE ] - %temp5.0 = phi float [ %27, %IF ], [ %60, %IF23 ], [ 0.000000e+00, %ELSE ] - %temp6.0 = phi float [ %29, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] - %temp7.0 = phi float [ %35, %IF ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF23 ] - %44 = insertelement <4 x float> undef, float %temp4.0, i32 0 - %45 = insertelement <4 x float> %44, float %temp5.0, i32 1 - %46 = insertelement <4 x float> %45, float %temp6.0, i32 2 - %47 = insertelement <4 x float> %46, float %temp7.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %47, i32 0, i32 0) - ret void - -IF23: ; preds = %ELSE - %48 = fcmp ult float 0.000000e+00, %2 - %49 = select i1 %48, float 1.000000e+00, float 0.000000e+00 - %50 = fsub float -0.000000e+00, %49 - %51 = fptosi float %50 to i32 - %52 = bitcast i32 %51 to float - %53 = bitcast float %52 to i32 - %54 = icmp ne i32 %53, 0 - %.28 = select i1 %54, float 0x36A0000000000000, float 0.000000e+00 - %55 = bitcast float %.28 to i32 - %56 = sitofp i32 %55 to float - %57 = load <4 x float>, <4 x float> addrspace(8)* null - %58 = extractelement <4 x float> %57, i32 0 - %59 = fsub float -0.000000e+00, %58 - %60 = fadd float %2, %59 - br label %ENDIF -} - -declare float @fabs(float) #0 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { readonly } diff --git a/test/CodeGen/R600/schedule-if.ll b/test/CodeGen/R600/schedule-if.ll deleted file mode 100644 index 94c653c8f25..00000000000 --- a/test/CodeGen/R600/schedule-if.ll +++ /dev/null @@ -1,46 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs -;REQUIRES: asserts - -define void @main() { -main_body: - %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %1 = extractelement <4 x float> %0, i32 0 - %2 = bitcast float %1 to i32 - %3 = icmp eq i32 %2, 0 - %4 = sext i1 %3 to i32 - %5 = bitcast i32 %4 to float - %6 = bitcast float %5 to i32 - %7 = icmp ne i32 %6, 0 - br i1 %7, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %9 = extractelement <4 x float> %8, i32 0 - %10 = bitcast float %9 to i32 - %11 = icmp eq i32 %10, 1 - %12 = sext i1 %11 to i32 - %13 = bitcast i32 %12 to float - %14 = bitcast float %13 to i32 - %15 = icmp ne i32 %14, 0 - br i1 %15, label %IF13, label %ENDIF - -ENDIF: ; preds = %IF13, %ELSE, %main_body - %temp.0 = phi float [ 1.000000e+03, %main_body ], [ 
1.000000e+00, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp1.0 = phi float [ 0.000000e+00, %main_body ], [ %23, %IF13 ], [ 0.000000e+00, %ELSE ] - %temp3.0 = phi float [ 1.000000e+00, %main_body ], [ 0.000000e+00, %ELSE ], [ 0.000000e+00, %IF13 ] - %16 = insertelement <4 x float> undef, float %temp.0, i32 0 - %17 = insertelement <4 x float> %16, float %temp1.0, i32 1 - %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 2 - %19 = insertelement <4 x float> %18, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %19, i32 0, i32 0) - ret void - -IF13: ; preds = %ELSE - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fsub float -0.000000e+00, %21 - %23 = fadd float 1.000000e+03, %22 - br label %ENDIF -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll deleted file mode 100644 index 6b3e0814c38..00000000000 --- a/test/CodeGen/R600/schedule-kernel-arg-loads.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI --check-prefix=GCN %s - -; FUNC-LABEL: {{^}}cluster_arg_loads: -; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9 -; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe -; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 -; VI-NEXT: s_nop 0 -; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { - store i32 %x, i32 addrspace(1)* %out0, align 4 - store i32 %y, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when -; s_load_dwordx2 has a register offset - -; FUNC-LABEL: @same_base_ptr_crash -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: s_endpgm -define void @same_base_ptr_crash(i64 addrspace(1)* %out, - i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, - i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, - i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, - i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, - i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, - i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, - i64 %arg48, i64 %arg49, i64 %arg50, i64 %arg51, i64 %arg52, i64 %arg53, i64 %arg54, i64 %arg55, - i64 %arg56, i64 %arg57, i64 %arg58, i64 %arg59, i64 %arg60, i64 %arg61, i64 %arg62, i64 %arg63, - i64 %arg64, i64 %arg65, i64 %arg66, i64 %arg67, i64 %arg68, i64 %arg69, i64 %arg70, i64 %arg71, - i64 %arg72, i64 %arg73, i64 %arg74, i64 %arg75, i64 %arg76, i64 %arg77, i64 %arg78, i64 %arg79, - i64 %arg80, i64 
%arg81, i64 %arg82, i64 %arg83, i64 %arg84, i64 %arg85, i64 %arg86, i64 %arg87, - i64 %arg88, i64 %arg89, i64 %arg90, i64 %arg91, i64 %arg92, i64 %arg93, i64 %arg94, i64 %arg95, - i64 %arg96, i64 %arg97, i64 %arg98, i64 %arg99, i64 %arg100, i64 %arg101, i64 %arg102, i64 %arg103, - i64 %arg104, i64 %arg105, i64 %arg106, i64 %arg107, i64 %arg108, i64 %arg109, i64 %arg110, i64 %arg111, - i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119, - i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) { -entry: - %value = add i64 %arg125, %arg126 - store i64 %value, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll deleted file mode 100644 index 3863afda5dd..00000000000 --- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll +++ /dev/null @@ -1,163 +0,0 @@ -; XFAIL: * -; REQUIRES: asserts -; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI - -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate - - -; SI-LABEL: {{^}}main( -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 2 - %2 = fcmp ult float %0, 0.000000e+00 - %3 = select i1 %2, float 1.000000e+00, float 0.000000e+00 - %4 = fsub float -0.000000e+00, %3 - %5 = fptosi float %4 to i32 - %6 = bitcast i32 %5 to float - %7 = bitcast float %6 to i32 - %8 = icmp ne i32 %7, 0 - br i1 %8, label %LOOP, label %ENDIF - -Flow1: ; preds = %ENDIF19, %ENDIF16 - %9 = phi float [ %115, %ENDIF19 ], [ undef, %ENDIF16 ] - %10 = phi float [ %114, %ENDIF19 ], [ undef, %ENDIF16 ] - %11 = phi float [ %113, %ENDIF19 ], [ undef, %ENDIF16 ] - %12 = phi float [ %112, %ENDIF19 ], [ undef, %ENDIF16 ] - %13 = phi float [ %111, %ENDIF19 ], [ undef, %ENDIF16 ] - %14 = phi i1 [ false, %ENDIF19 ], [ true, %ENDIF16 ] - br label %Flow - -Flow2: ; preds = %Flow - br label %ENDIF - -ENDIF: ; preds = %main_body, %Flow2 - %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %104, %Flow2 ] - %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %103, %Flow2 ] - %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %102, %Flow2 ] - %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ] - %15 = extractelement <4 x float> %reg1, i32 1 - %16 = extractelement <4 x float> %reg1, i32 3 - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null - %21 = extractelement <4 x float> %20, i32 1 - %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* null - %24 = extractelement <4 x float> %23, i32 2 - %25 = fmul float %24, %0 - %26 = load <4 x float>, <4 x float> addrspace(9)* null - %27 = extractelement <4 x float> %26, i32 3 - %28 = fmul float %27, %0 - %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %30 = extractelement <4 x float> %29, i32 0 - %31 = fmul float %30, %15 - %32 = fadd float %31, %19 - %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %34 = 
extractelement <4 x float> %33, i32 1 - %35 = fmul float %34, %15 - %36 = fadd float %35, %22 - %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %38 = extractelement <4 x float> %37, i32 2 - %39 = fmul float %38, %15 - %40 = fadd float %39, %25 - %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %42 = extractelement <4 x float> %41, i32 3 - %43 = fmul float %42, %15 - %44 = fadd float %43, %28 - %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %46 = extractelement <4 x float> %45, i32 0 - %47 = fmul float %46, %1 - %48 = fadd float %47, %32 - %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %50 = extractelement <4 x float> %49, i32 1 - %51 = fmul float %50, %1 - %52 = fadd float %51, %36 - %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %54 = extractelement <4 x float> %53, i32 2 - %55 = fmul float %54, %1 - %56 = fadd float %55, %40 - %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %58 = extractelement <4 x float> %57, i32 3 - %59 = fmul float %58, %1 - %60 = fadd float %59, %44 - %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %62 = extractelement <4 x float> %61, i32 0 - %63 = fmul float %62, %16 - %64 = fadd float %63, %48 - %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %66 = extractelement <4 x float> %65, i32 1 - %67 = fmul float %66, %16 - %68 = fadd float %67, %52 - %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %70 = extractelement <4 x float> %69, i32 2 - %71 = fmul float %70, %16 - %72 = fadd float %71, %56 - %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %74 = extractelement <4 x float> %73, i32 3 - %75 = fmul float %74, %16 - %76 = fadd float %75, %60 - %77 = insertelement <4 x float> undef, float %64, i32 0 - %78 = insertelement <4 x float> %77, float %68, i32 1 - %79 = insertelement <4 x float> %78, float %72, i32 2 - %80 = insertelement <4 x float> %79, float %76, i32 3 - call void @llvm.AMDGPU.barrier.local() - %81 = insertelement <4 x float> undef, float %temp.0, i32 0 - %82 = insertelement <4 x float> %81, float %temp1.0, i32 1 - %83 = insertelement <4 x float> %82, float %temp2.0, i32 2 - %84 = insertelement <4 x float> %83, float %temp3.0, i32 3 - call void @llvm.AMDGPU.barrier.local() - ret void - -LOOP: ; preds = %main_body, %Flow - %temp.1 = phi float [ %109, %Flow ], [ 0.000000e+00, %main_body ] - %temp1.1 = phi float [ %108, %Flow ], [ 1.000000e+00, %main_body ] - %temp2.1 = phi float [ %107, %Flow ], [ 0.000000e+00, %main_body ] - %temp3.1 = phi float [ %106, %Flow ], [ 0.000000e+00, %main_body ] - %temp4.0 = phi float [ %105, %Flow ], [ -2.000000e+00, %main_body ] - %85 = fcmp uge float %temp4.0, %0 - %86 = select i1 %85, 
float 1.000000e+00, float 0.000000e+00 - %87 = fsub float -0.000000e+00, %86 - %88 = fptosi float %87 to i32 - %89 = bitcast i32 %88 to float - %90 = bitcast float %89 to i32 - %91 = icmp ne i32 %90, 0 - %92 = xor i1 %91, true - br i1 %92, label %ENDIF16, label %Flow - -ENDIF16: ; preds = %LOOP - %93 = fcmp une float %1, %temp4.0 - %94 = select i1 %93, float 1.000000e+00, float 0.000000e+00 - %95 = fsub float -0.000000e+00, %94 - %96 = fptosi float %95 to i32 - %97 = bitcast i32 %96 to float - %98 = bitcast float %97 to i32 - %99 = icmp ne i32 %98, 0 - %100 = xor i1 %99, true - br i1 %100, label %ENDIF19, label %Flow1 - -Flow: ; preds = %Flow1, %LOOP - %101 = phi float [ %temp3.1, %Flow1 ], [ %temp3.1, %LOOP ] - %102 = phi float [ %temp2.1, %Flow1 ], [ %temp2.1, %LOOP ] - %103 = phi float [ %temp1.1, %Flow1 ], [ %temp1.1, %LOOP ] - %104 = phi float [ %temp.1, %Flow1 ], [ %temp.1, %LOOP ] - %105 = phi float [ %9, %Flow1 ], [ undef, %LOOP ] - %106 = phi float [ %10, %Flow1 ], [ undef, %LOOP ] - %107 = phi float [ %11, %Flow1 ], [ undef, %LOOP ] - %108 = phi float [ %12, %Flow1 ], [ undef, %LOOP ] - %109 = phi float [ %13, %Flow1 ], [ undef, %LOOP ] - %110 = phi i1 [ %14, %Flow1 ], [ true, %LOOP ] - br i1 %110, label %Flow2, label %LOOP - -ENDIF19: ; preds = %ENDIF16 - %111 = fadd float %temp.1, 1.000000e+00 - %112 = fadd float %temp1.1, 0.000000e+00 - %113 = fadd float %temp2.1, 0.000000e+00 - %114 = fadd float %temp3.1, 0.000000e+00 - %115 = fadd float %temp4.0, 1.000000e+00 - br label %Flow1 -} - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll deleted file mode 100644 index 8d980dbf899..00000000000 --- a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll +++ /dev/null @@ -1,132 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -;REQUIRES: asserts - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = fcmp ult float %0, 0.000000e+00 - %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00 - %6 = fsub float -0.000000e+00, %5 - %7 = fptosi float %6 to i32 - %8 = bitcast i32 %7 to float - %9 = bitcast float %8 to i32 - %10 = icmp ne i32 %9, 0 - br i1 %10, label %LOOP, label %ENDIF - -ENDIF: ; preds = %ENDIF16, %LOOP, %main_body - %temp.0 = phi float [ 0.000000e+00, %main_body ], [ %temp.1, %LOOP ], [ %temp.1, %ENDIF16 ] - %temp1.0 = phi float [ 1.000000e+00, %main_body ], [ %temp1.1, %LOOP ], [ %temp1.1, %ENDIF16 ] - %temp2.0 = phi float [ 0.000000e+00, %main_body ], [ %temp2.1, %LOOP ], [ %temp2.1, %ENDIF16 ] - %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %temp3.1, %LOOP ], [ %temp3.1, %ENDIF16 ] - %11 = load <4 x float>, <4 x float> addrspace(9)* null - %12 = extractelement <4 x float> %11, i32 0 - %13 = fmul float %12, %0 - %14 = load <4 x float>, <4 x float> addrspace(9)* null - %15 = extractelement <4 x float> %14, i32 1 - %16 = fmul float %15, %0 - %17 = load <4 x float>, <4 x float> addrspace(9)* null - %18 = extractelement <4 x float> %17, i32 2 - %19 = fmul float %18, %0 - %20 = load <4 x float>, <4 x float> addrspace(9)* null - %21 = extractelement <4 x float> %20, i32 3 - %22 = fmul float %21, %0 - %23 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, 
i64 0, i32 1) - %24 = extractelement <4 x float> %23, i32 0 - %25 = fmul float %24, %1 - %26 = fadd float %25, %13 - %27 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %28 = extractelement <4 x float> %27, i32 1 - %29 = fmul float %28, %1 - %30 = fadd float %29, %16 - %31 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %32 = extractelement <4 x float> %31, i32 2 - %33 = fmul float %32, %1 - %34 = fadd float %33, %19 - %35 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) - %36 = extractelement <4 x float> %35, i32 3 - %37 = fmul float %36, %1 - %38 = fadd float %37, %22 - %39 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %40 = extractelement <4 x float> %39, i32 0 - %41 = fmul float %40, %2 - %42 = fadd float %41, %26 - %43 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %44 = extractelement <4 x float> %43, i32 1 - %45 = fmul float %44, %2 - %46 = fadd float %45, %30 - %47 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %48 = extractelement <4 x float> %47, i32 2 - %49 = fmul float %48, %2 - %50 = fadd float %49, %34 - %51 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) - %52 = extractelement <4 x float> %51, i32 3 - %53 = fmul float %52, %2 - %54 = fadd float %53, %38 - %55 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %56 = extractelement <4 x float> %55, i32 0 - %57 = fmul float %56, %3 - %58 = fadd float %57, %42 - %59 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %60 = extractelement <4 x float> %59, i32 1 - %61 = fmul float %60, %3 - %62 = fadd float %61, %46 - %63 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %64 = extractelement <4 x float> %63, i32 2 - %65 = fmul float %64, %3 - %66 = fadd float %65, %50 - %67 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3) - %68 = extractelement <4 x float> %67, i32 3 - %69 = fmul float %68, %3 - %70 = fadd float %69, %54 - %71 = insertelement <4 x float> undef, float %58, i32 0 - %72 = insertelement <4 x float> %71, float %62, i32 1 - %73 = insertelement <4 x float> %72, float %66, i32 2 - %74 = insertelement <4 x float> %73, float %70, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %74, i32 60, i32 1) - %75 = insertelement <4 x float> undef, float %temp.0, i32 0 - %76 = insertelement <4 x float> %75, float %temp1.0, i32 1 - %77 = insertelement <4 x float> %76, float %temp2.0, i32 2 - %78 = insertelement <4 x float> %77, float %temp3.0, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %78, i32 0, i32 2) - ret void - -LOOP: ; preds = %main_body, %ENDIF19 - %temp.1 = phi float [ %93, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp1.1 = 
phi float [ %94, %ENDIF19 ], [ 1.000000e+00, %main_body ] - %temp2.1 = phi float [ %95, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp3.1 = phi float [ %96, %ENDIF19 ], [ 0.000000e+00, %main_body ] - %temp4.0 = phi float [ %97, %ENDIF19 ], [ -2.000000e+00, %main_body ] - %79 = fcmp uge float %temp4.0, %0 - %80 = select i1 %79, float 1.000000e+00, float 0.000000e+00 - %81 = fsub float -0.000000e+00, %80 - %82 = fptosi float %81 to i32 - %83 = bitcast i32 %82 to float - %84 = bitcast float %83 to i32 - %85 = icmp ne i32 %84, 0 - br i1 %85, label %ENDIF, label %ENDIF16 - -ENDIF16: ; preds = %LOOP - %86 = fcmp une float %2, %temp4.0 - %87 = select i1 %86, float 1.000000e+00, float 0.000000e+00 - %88 = fsub float -0.000000e+00, %87 - %89 = fptosi float %88 to i32 - %90 = bitcast i32 %89 to float - %91 = bitcast float %90 to i32 - %92 = icmp ne i32 %91, 0 - br i1 %92, label %ENDIF, label %ENDIF19 - -ENDIF19: ; preds = %ENDIF16 - %93 = fadd float %temp.1, 1.000000e+00 - %94 = fadd float %temp1.1, 0.000000e+00 - %95 = fadd float %temp2.1, 0.000000e+00 - %96 = fadd float %temp3.1, 0.000000e+00 - %97 = fadd float %temp4.0, 1.000000e+00 - br label %LOOP -} - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } diff --git a/test/CodeGen/R600/scratch-buffer.ll b/test/CodeGen/R600/scratch-buffer.ll deleted file mode 100644 index 56088718ada..00000000000 --- a/test/CodeGen/R600/scratch-buffer.ll +++ /dev/null @@ -1,87 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; When a frame index offset is more than 12-bits, make sure we don't store -; it in mubuf's offset field. - -; Also, make sure we use the same register for storing the scratch buffer address -; for both stores. This register is allocated by the register scavenger, so we -; should be able to reuse the same register for each scratch buffer access.
- -; CHECK-LABEL: {{^}}legal_offset_fi: -; CHECK: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0{{$}} -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_mov_b32_e32 [[OFFSET]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} - -define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { -entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] - - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 0 - store i32 1, i32* %scratchptr0 - - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 0 - store i32 2, i32* %scratchptr1 - - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else - -if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr - br label %done - -else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr - br label %done - -done: - %value = phi i32 [%if_value, %if], [%else_value, %else] - store i32 %value, i32 addrspace(1)* %out - ret void - - ret void - -} - -; CHECK-LABEL: {{^}}legal_offset_fi_offset -; CHECK: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen -; CHECK: v_add_i32_e32 [[OFFSET:v[0-9]+]], 0x8000 -; CHECK: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} - -define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { -entry: - %scratch0 = alloca [8192 x i32] - %scratch1 = alloca [8192 x i32] - - %offset0 = load i32, i32 addrspace(1)* %offsets - %scratchptr0 = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %offset0 - store i32 %offset0, i32* %scratchptr0 - - %offsetptr1 = getelementptr i32, i32 addrspace(1)* %offsets, i32 1 - %offset1 = load i32, i32 addrspace(1)* %offsetptr1 - %scratchptr1 = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %offset1 - store i32 %offset1, i32* %scratchptr1 - - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else - -if: - %if_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch0, i32 0, i32 %if_offset - %if_value = load i32, i32* %if_ptr - br label %done - -else: - %else_ptr = getelementptr [8192 x i32], [8192 x i32]* %scratch1, i32 0, i32 %else_offset - %else_value = load i32, i32* %else_ptr - br label %done - -done: - %value = phi i32 [%if_value, %if], [%else_value, %else] - store i32 %value, i32 addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll deleted file mode 100644 index de645353a40..00000000000 --- a/test/CodeGen/R600/sdiv.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; The code generated by sdiv is long and complex and may frequently change. -; The goal of this test is to make sure the ISel doesn't fail. 
-; -; This program was previously failing to compile when one of the selectcc -; opcodes generated by the sdiv lowering was being legalized and optimized to: -; selectcc Remainder -1, 0, -1, SETGT -; This was fixed by adding an additional pattern in R600Instructions.td to -; match this pattern with a CNDGE_INT. - -; FUNC-LABEL: {{^}}sdiv_i32: -; EG: CF_END -define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in - %den = load i32, i32 addrspace(1) * %den_ptr - %result = sdiv i32 %num, %den - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sdiv_i32_4: -define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = sdiv i32 %num, 4 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; Multiply by a weird constant to make sure setIntDivIsCheap is -; working. - -; FUNC-LABEL: {{^}}slow_sdiv_i32_3435: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b -; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]] -; SI: v_add_i32 -; SI: v_lshrrev_b32 -; SI: v_ashrrev_i32 -; SI: v_add_i32 -; SI: buffer_store_dword -; SI: s_endpgm -define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = sdiv i32 %num, 3435 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr - %result = sdiv <2 x i32> %num, %den - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %result = sdiv <2 x i32> %num, <i32 4, i32 4> - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr - %result = sdiv <4 x i32> %num, %den - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; Tests for 64-bit divide bypass.
-; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %result = sdiv i64 %a, %b -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %result = srem i64 %a, %b -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } - -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { -; %resultdiv = sdiv i64 %a, %b -; %resultrem = srem i64 %a, %b -; %result = add i64 %resultdiv, %resultrem -; store i64 %result, i64 addrspace(1)* %out, align 8 -; ret void -; } diff --git a/test/CodeGen/R600/sdivrem24.ll b/test/CodeGen/R600/sdivrem24.ll deleted file mode 100644 index ad5df39f550..00000000000 --- a/test/CodeGen/R600/sdivrem24.ll +++ /dev/null @@ -1,239 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}sdiv24_i8: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = sdiv i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sdiv24_i16: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = sdiv i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}sdiv24_i32: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sdiv25_i32: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; 
EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = sdiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}srem24_i8: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = srem i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srem24_i16: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = srem i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}srem24_i32: -; SI: v_cvt_f32_i32 -; SI: v_cvt_f32_i32 -; SI: v_rcp_f32 -; SI: v_cvt_i32_f32 - -; EG: INT_TO_FLT -; EG-DAG: INT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_INT -define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}srem25_i32: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_srem24_i32_1: -; SI-NOT: v_cvt_f32_i32 
-; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = ashr i32 %num.i24.0, 8 - %den.i24 = ashr i32 %den.i24.0, 7 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_srem24_i32_2: -; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 - -; EG-NOT: INT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = ashr i32 %num.i24.0, 7 - %den.i24 = ashr i32 %den.i24.0, 8 - %result = srem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/sdivrem64.ll b/test/CodeGen/R600/sdivrem64.ll deleted file mode 100644 index a9b2b7f9df5..00000000000 --- a/test/CodeGen/R600/sdivrem64.ll +++ /dev/null @@ -1,225 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test_sdiv: -;EG: RECIP_UINT -;EG: LSHL {{.*}}, 1, -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT - -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN: v_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = sdiv i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem: -;EG: RECIP_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: AND_INT {{.*}}, 1, - -;GCN: 
s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = urem i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_sdiv3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 33 - %2 = ashr i64 %y, 33 - %result = sdiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 33 - %2 = ashr i64 %y, 33 - %result = srem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_sdiv2464: -;EG: INT_TO_FLT -;EG: INT_TO_FLT -;EG: FLT_TO_INT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 40 - %2 = ashr i64 %y, 40 - %result = sdiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_srem2464: -;EG: INT_TO_FLT -;EG: INT_TO_FLT -;EG: FLT_TO_INT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = ashr i64 %x, 40 - %2 = ashr i64 %y, 40 - %result = srem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/select-i1.ll b/test/CodeGen/R600/select-i1.ll deleted file mode 100644 index 6735394e93a..00000000000 --- a/test/CodeGen/R600/select-i1.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI - -; FUNC-LABEL: {{^}}select_i1: -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 -define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i1 %a, i1 %b - store i1 %sel, i1 addrspace(1)* %out, align 4 - ret void -} - diff --git a/test/CodeGen/R600/select-vectors.ll b/test/CodeGen/R600/select-vectors.ll deleted file mode 100644 index 59082c65cc8..00000000000 --- a/test/CodeGen/R600/select-vectors.ll +++ /dev/null @@ -1,156 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs 
-march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; Test expansion of scalar selects on vectors. -; Evergreen not enabled since it seems to be having problems with doubles. - - -; FUNC-LABEL: {{^}}select_v4i8: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { - %cmp = icmp eq i8 %c, 0 - %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b - store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}select_v4i16: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b - store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}select_v2i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: buffer_store_dwordx2 -define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b - store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}select_v4i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: buffer_store_dwordx4 -define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b - store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b - store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v2f32: -; SI: buffer_store_dwordx2 -define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x float> %a, <2 x float> %b - store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v4f32: -; SI: buffer_store_dwordx4 -define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x float> %a, <4 x float> %b - store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8f32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x float> %a, <8 x float> %b - store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v2f64: -; 
SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <2 x double> %a, <2 x double> %b - store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v4f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <4 x double> %a, <4 x double> %b - store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FUNC-LABEL: {{^}}select_v8f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, <8 x double> %a, <8 x double> %b - store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll deleted file mode 100644 index 45f3cd5a7ac..00000000000 --- a/test/CodeGen/R600/select.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; Normally icmp + select is optimized to select_cc, when this happens the -; DAGLegalizer never sees the select and doesn't have a chance to leaglize it. -; -; In order to avoid the select_cc optimization, this test case calculates the -; condition for the select in a separate basic block. 
- -; FUNC-LABEL: {{^}}select: -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, - <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, - <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, - i32 %cond) { -entry: - br label %for -body: - %inc = add i32 %i, 1 - %br_cmp.i = icmp eq i1 %br_cmp, 0 - br label %for -for: - %i = phi i32 [ %inc, %body], [ 0, %entry ] - %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] - %0 = icmp eq i32 %cond, %i - %1 = select i1 %br_cmp, i32 2, i32 3 - %2 = select i1 %br_cmp, float 2.0 , float 5.0 - %3 = select i1 %br_cmp, <2 x i32> , <2 x i32> - %4 = select i1 %br_cmp, <2 x float> , <2 x float> - %5 = select i1 %br_cmp, <4 x i32> , <4 x i32> - %6 = select i1 %br_cmp, <4 x float> , <4 x float> - br i1 %0, label %body, label %done - -done: - store i32 %1, i32 addrspace(1)* %i32out - store float %2, float addrspace(1)* %f32out - store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out - store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out - store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out - store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out - ret void -} diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll deleted file mode 100644 index 5cebb30dc72..00000000000 --- a/test/CodeGen/R600/select64.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK-LABEL: {{^}}select0: -; i64 select should be split into two i32 selects, and we shouldn't need -; to use a shfit to extract the hi dword of the input. 
-; CHECK-NOT: s_lshr_b64 -; CHECK: v_cndmask -; CHECK: v_cndmask -define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { -entry: - %0 = icmp ugt i32 %cond, 5 - %1 = select i1 %0, i64 0, i64 %in - store i64 %1, i64 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}select_trunc_i64: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i64 0, i64 %in - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}select_trunc_i64_2: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %sel = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}v_select_trunc_i64_2: -; CHECK: v_cndmask_b32 -; CHECK-NOT: v_cndmask_b32 -define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %sel = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %sel to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; CHECK-LABEL: {{^}}v_select_i64_split_imm: -; CHECK: s_mov_b32 [[SHI:s[0-9]+]], 63 -; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 -; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] -; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} -; CHECK: s_endpgm -define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %cmp = icmp ugt i32 %cond, 5 - %a = load i64, i64 addrspace(1)* %aptr, align 8 - %b = load i64, i64 addrspace(1)* %bptr, align 8 - %sel = select i1 %cmp, i64 %a, i64 270582939648 ; 63 << 32 - store i64 %sel, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/selectcc-cnd.ll b/test/CodeGen/R600/selectcc-cnd.ll deleted file mode 100644 index 94d0ace7569..00000000000 --- a/test/CodeGen/R600/selectcc-cnd.ll +++ /dev/null @@ -1,12 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: SETE -;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, -;CHECK: 1073741824 -define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { - %1 = load float, float addrspace(1)* %in - %2 = fcmp oeq float %1, 0.0 - %3 = select i1 %2, float 1.0, float 2.0 - store float %3, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc-cnde-int.ll b/test/CodeGen/R600/selectcc-cnde-int.ll deleted file mode 100644 index 58a4ee7d62b..00000000000 --- a/test/CodeGen/R600/selectcc-cnde-int.ll +++ /dev/null @@ -1,12 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: SETE_INT -;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, -;CHECK-NEXT: 2 -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %1 = load i32, i32 addrspace(1)* %in - %2 = icmp eq i32 %1, 0 - %3 = select i1 %2, i32 1, i32 2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll 
b/test/CodeGen/R600/selectcc-icmp-select-float.ll deleted file mode 100644 index e870ee891e6..00000000000 --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; Note additional optimizations may cause this SGT to be replaced with a -; CND* instruction. -; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, literal.x, -; CHECK-NEXT: -1 -; Test a selectcc with i32 LHS/RHS and float True/False - -define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - %1 = icmp sge i32 %0, 0 - %2 = select i1 %1, float 1.0, float 0.0 - store float %2, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll deleted file mode 100644 index 65be4a626a1..00000000000 --- a/test/CodeGen/R600/selectcc-opt.ll +++ /dev/null @@ -1,80 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}test_a: -; EG-NOT: CND -; EG: SET{{[NEQGTL]+}}_DX10 - -define void @test_a(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 0.000000e+00 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - %4 = bitcast i32 %3 to float - %5 = bitcast float %4 to i32 - %6 = icmp ne i32 %5, 0 - br i1 %6, label %IF, label %ENDIF - -IF: - %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %7 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} - -; Same as test_a, but the branch labels are swapped to produce the inverse cc -; for the icmp instruction - -; EG-LABEL: {{^}}test_b: -; EG: SET{{[GTEQN]+}}_DX10 -; EG-NEXT: PRED_ -; EG-NEXT: ALU clause starting -define void @test_b(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 0.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - %4 = bitcast i32 %3 to float - %5 = bitcast float %4 to i32 - %6 = icmp ne i32 %5, 0 - br i1 %6, label %ENDIF, label %IF - -IF: - %7 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - store i32 0, i32 addrspace(1)* %7 - br label %ENDIF - -ENDIF: - store i32 0, i32 addrspace(1)* %out - ret void -} - -; Test a CND*_INT instruction with float true/false values -; EG-LABEL: {{^}}test_c: -; EG: CND{{[GTE]+}}_INT -define void @test_c(float addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sgt i32 %in, 0 - %1 = select i1 %0, float 2.0, float 3.0 - store float %1, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}selectcc_bool: -; SI: v_cmp_ne_i32 -; SI-NEXT: v_cndmask_b32_e64 -; SI-NOT: cmp -; SI-NOT: cndmask -define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = select i1 %icmp0, i32 -1, i32 0 - store i32 %ext, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/selectcc.ll b/test/CodeGen/R600/selectcc.ll deleted file mode 100644 index f378e15dd76..00000000000 --- a/test/CodeGen/R600/selectcc.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc 
-verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}selectcc_i64: -; EG: XOR_INT -; EG: XOR_INT -; EG: OR_INT -; EG: CNDE_INT -; EG: CNDE_INT -; SI: v_cmp_eq_i64 -; SI: v_cndmask -; SI: v_cndmask -define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { -entry: - %0 = icmp eq i64 %lhs, %rhs - %1 = select i1 %0, i64 %true, i64 %false - store i64 %1, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll deleted file mode 100644 index 53694dcffa6..00000000000 --- a/test/CodeGen/R600/set-dx10.ll +++ /dev/null @@ -1,161 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; These tests check that floating point comparisons which are used by select -; to store integer true (-1) and false (0) values are lowered to one of the -; SET*DX10 instructions. - -; CHECK: {{^}}fcmp_une_select_fptosi: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp une float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_une_select_i32: -; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp une float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oeq_select_fptosi: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oeq_select_i32: -; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oeq float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ogt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ogt float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ogt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ogt float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - 
-; CHECK: {{^}}fcmp_oge_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oge float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_oge_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp oge float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ole_select_fptosi: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_ole_select_i32: -; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_olt_select_fptosi: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 - %3 = fptosi float %2 to i32 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; CHECK: {{^}}fcmp_olt_select_i32: -; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z, -; CHECK-NEXT: LSHR -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/setcc-equivalent.ll b/test/CodeGen/R600/setcc-equivalent.ll deleted file mode 100644 index 11ea793650c..00000000000 --- a/test/CodeGen/R600/setcc-equivalent.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -; EG-LABEL: {{^}}and_setcc_setcc_i32: -; EG: AND_INT -; EG-NEXT: SETE_INT -define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %cmp1 = icmp eq i32 %a, -1 - %cmp2 = icmp eq i32 %b, -1 - %and = and i1 %cmp1, %cmp2 - %ext = sext i1 %and to i32 - store i32 %ext, i32 addrspace(1)* %out, align 4 - ret void -} - -; EG-LABEL: {{^}}and_setcc_setcc_v4i32: -; EG: AND_INT -; EG: AND_INT -; EG: SETE_INT -; EG: AND_INT -; EG: SETE_INT -; EG: AND_INT -; EG: SETE_INT -define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { - %cmp1 = icmp eq <4 x i32> %a, - %cmp2 = icmp eq <4 x i32> %b, - %and = and <4 x i1> %cmp1, %cmp2 - %ext = sext <4 x i1> %and to <4 x i32> - store <4 x i32> %ext, <4 x i32> 
addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll deleted file mode 100644 index 4e6a10d6b78..00000000000 --- a/test/CodeGen/R600/setcc-opt.ll +++ /dev/null @@ -1,236 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT:buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm - -; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W -; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm - -; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W -; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; This really folds away to false -; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc -; GCN-NEXT: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] -; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; This really folds away to true -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, -1, vcc -; GCN-NEXT: v_cmp_ne_i32_e32 vcc, 1, [[TMP]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[TMP:v[0-9]+]], 0, 1, -; GCN-NEXT: buffer_store_byte [[TMP]] -; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0: -; GCN-NOT: v_cmp -; GCN: v_cmp_ne_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void 
@zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 0 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1: -; GCN-NOT: v_cmp -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp eq i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1: -; GCN-NOT: v_cmp -; GCN: v_cmp_eq_i32_e32 vcc, -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 1 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: -; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; GCN: v_cmp_ne_i32_e32 vcc, 2, [[VB]]{{$}} -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN: buffer_store_byte -; GCN: s_endpgm -define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = sext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_zext_k_i8max: -; GCN: buffer_load_ubyte [[B:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN: v_mov_b32_e32 [[K255:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K255]], [[B]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = zext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, 255 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1: -; GCN: buffer_load_sbyte [[B:v[0-9]+]] -; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { - %b = load i8, i8 addrspace(1)* %b.ptr - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_sext_arg: -; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: v_cmp_ne_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -1, [[B]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FIXME: This ends up doing a buffer_load_ubyte, and and compare to -; 255. Seems to be because of ordering problems when not allowing load widths to be reduced. 
-; Should do a buffer_load_sbyte and compare with -1 - -; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: -; GCN-DAG: buffer_load_ubyte [[B:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff{{$}} -; GCN: v_cmp_ne_i32_e32 vcc, [[K]], [[B]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc -; GCN-NEXT: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = sext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}cmp_zext_k_neg1: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN: s_endpgm -define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { - %b.ext = zext i8 %b to i32 - %icmp0 = icmp ne i32 %b.ext, -1 - store i1 %icmp0, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_ne_k: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp ne i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zext_bool_icmp_eq_k: -; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; GCN: buffer_store_byte [[RESULT]] -; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %icmp0 = icmp ne i32 %a, %b - %ext = zext i1 %icmp0 to i32 - %icmp1 = icmp eq i32 %ext, 2 - store i1 %icmp1, i1 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll deleted file mode 100644 index f33a82df5ff..00000000000 --- a/test/CodeGen/R600/setcc.ll +++ /dev/null @@ -1,377 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; FUNC-LABEL: {{^}}setcc_v2i32: -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y - -define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) { - %result = icmp eq <2 x i32> %a, %b - %sext = sext <2 x i1> %result to <2 x i32> - store <2 x i32> %sext, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}setcc_v4i32: -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = icmp eq <4 x i32> %a, %b - %sext = sext <4 x i1> %result to <4 x i32> - store <4 x i32> %sext, <4 x i32> addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; Float comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}f32_oeq: -; R600: SETE_DX10 -; SI: v_cmp_eq_f32 -define void 
@f32_oeq(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp oeq float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ogt: -; R600: SETGT_DX10 -; SI: v_cmp_gt_f32 -define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ogt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_oge: -; R600: SETGE_DX10 -; SI: v_cmp_ge_f32 -define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp oge float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_olt: -; R600: SETGT_DX10 -; SI: v_cmp_lt_f32 -define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp olt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ole: -; R600: SETGE_DX10 -; SI: v_cmp_le_f32 -define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ole float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_one: -; R600-DAG: SETE_DX10 -; R600-DAG: SETE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_INT - -; SI: v_cmp_lg_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp one float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ord: -; R600-DAG: SETE_DX10 -; R600-DAG: SETE_DX10 -; R600-DAG: AND_INT -; R600-DAG: SETNE_INT -; SI: v_cmp_o_f32 -define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ord float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ueq: -; R600-DAG: SETNE_DX10 -; R600-DAG: SETNE_DX10 -; R600-DAG: OR_INT -; R600-DAG: SETE_DX10 -; R600-DAG: OR_INT -; R600-DAG: SETNE_INT - -; SI: v_cmp_nlg_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ueq float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ugt: -; R600: SETGE -; R600: SETE_DX10 -; SI: v_cmp_nle_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ugt float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_uge: -; R600: SETGT -; R600: SETE_DX10 - -; SI: v_cmp_nlt_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp uge float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ult: -; R600: SETGE -; R600: SETE_DX10 - -; SI: v_cmp_nge_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ult float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_ule: -; R600: SETGT -; R600: SETE_DX10 - -; SI: v_cmp_ngt_f32_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ule(i32 
addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp ule float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_une: -; R600: SETNE_DX10 -; SI: v_cmp_neq_f32 -define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp une float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f32_uno: -; R600: SETNE_DX10 -; R600: SETNE_DX10 -; R600: OR_INT -; R600: SETNE_INT -; SI: v_cmp_u_f32 -define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) { -entry: - %0 = fcmp uno float %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; 32-bit integer comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}i32_eq: -; R600: SETE_INT -; SI: v_cmp_eq_i32 -define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ne: -; R600: SETNE_INT -; SI: v_cmp_ne_i32 -define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ne i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ugt: -; R600: SETGT_UINT -; SI: v_cmp_gt_u32 -define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ugt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_uge: -; R600: SETGE_UINT -; SI: v_cmp_ge_u32 -define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp uge i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ult: -; R600: SETGT_UINT -; SI: v_cmp_lt_u32 -define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ult i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_ule: -; R600: SETGE_UINT -; SI: v_cmp_le_u32 -define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp ule i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sgt: -; R600: SETGT_INT -; SI: v_cmp_gt_i32 -define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sgt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sge: -; R600: SETGE_INT -; SI: v_cmp_ge_i32 -define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sge i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_slt: -; R600: SETGT_INT -; SI: v_cmp_lt_i32 -define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp slt i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i32_sle: -; R600: SETGE_INT -; SI: v_cmp_le_i32 -define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp sle i32 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FIXME: This does 4 compares -; FUNC-LABEL: {{^}}v3i32_eq: -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: 
v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI: s_endpgm -define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid - %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid - %gep.out = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid - %a = load <3 x i32>, <3 x i32> addrspace(1)* %gep.a - %b = load <3 x i32>, <3 x i32> addrspace(1)* %gep.b - %cmp = icmp eq <3 x i32> %a, %b - %ext = sext <3 x i1> %cmp to <3 x i32> - store <3 x i32> %ext, <3 x i32> addrspace(1)* %gep.out - ret void -} - -; FUNC-LABEL: {{^}}v3i8_eq: -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI-DAG: v_cmp_eq_i32 -; SI-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, -; SI: s_endpgm -define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid - %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid - %gep.out = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %out, i32 %tid - %a = load <3 x i8>, <3 x i8> addrspace(1)* %gep.a - %b = load <3 x i8>, <3 x i8> addrspace(1)* %gep.b - %cmp = icmp eq <3 x i8> %a, %b - %ext = sext <3 x i1> %cmp to <3 x i8> - store <3 x i8> %ext, <3 x i8> addrspace(1)* %gep.out - ret void -} diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll deleted file mode 100644 index 231be7aa3da..00000000000 --- a/test/CodeGen/R600/setcc64.ll +++ /dev/null @@ -1,259 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; XXX: Merge this into setcc, once R600 supports 64-bit operations - -;;;==========================================================================;;; -;; Double comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}f64_oeq: -; SI: v_cmp_eq_f64 -define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp oeq double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ogt: -; SI: v_cmp_gt_f64 -define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ogt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_oge: -; SI: v_cmp_ge_f64 -define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp oge double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_olt: -; SI: v_cmp_lt_f64 -define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp olt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ole: -; SI: v_cmp_le_f64 -define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ole double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out 
- ret void -} - -; FUNC-LABEL: {{^}}f64_one: -; SI: v_cmp_lg_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp one double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ord: -; SI: v_cmp_o_f64 -define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ord double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ueq: -; SI: v_cmp_nlg_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ueq double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ugt: - -; SI: v_cmp_nle_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ugt double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_uge: -; SI: v_cmp_nlt_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp uge double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ult: -; SI: v_cmp_nge_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ult double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_ule: -; SI: v_cmp_ngt_f64_e32 vcc -; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp ule double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_une: -; SI: v_cmp_neq_f64 -define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp une double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}f64_uno: -; SI: v_cmp_u_f64 -define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) { -entry: - %0 = fcmp uno double %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -;;;==========================================================================;;; -;; 64-bit integer comparisons -;;;==========================================================================;;; - -; FUNC-LABEL: {{^}}i64_eq: -; SI: v_cmp_eq_i64 -define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp eq i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ne: -; SI: v_cmp_ne_i64 -define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ne i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ugt: -; SI: v_cmp_gt_u64 -define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ugt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_uge: -; SI: v_cmp_ge_u64 -define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp uge i64 %a, %b - %1 
= sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ult: -; SI: v_cmp_lt_u64 -define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ult i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_ule: -; SI: v_cmp_le_u64 -define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp ule i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sgt: -; SI: v_cmp_gt_i64 -define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sgt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sge: -; SI: v_cmp_ge_i64 -define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sge i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_slt: -; SI: v_cmp_lt_i64 -define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp slt i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}i64_sle: -; SI: v_cmp_le_i64 -define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %0 = icmp sle i64 %a, %b - %1 = sext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll deleted file mode 100644 index 9b5d6b5dbd6..00000000000 --- a/test/CodeGen/R600/seto.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] -; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { -main_body: - %c = fcmp oeq float %p, %p - %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll deleted file mode 100644 index 76346c4f624..00000000000 --- a/test/CodeGen/R600/setuo.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s - -; CHECK-LABEL: {{^}}main: -; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]] -; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]] -define void @main(float %p) { -main_body: - %c = fcmp une float %p, %p - %r = select i1 %c, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/sext-eliminate.ll b/test/CodeGen/R600/sext-eliminate.ll deleted file mode 100644 index 7dc6eb87f6b..00000000000 --- a/test/CodeGen/R600/sext-eliminate.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add: - -; EG: MEM_{{.*}} 
STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: SUB_INT {{[* ]*}}[[RES]] -; EG-NOT: BFE -define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { - %sext = sext i1 %a to i32 - %res = add i32 %b, %sext - store i32 %res, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub: - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT {{[* ]*}}[[RES]] -; EG-NOT: BFE -define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { - %sext = sext i1 %a to i32 - %res = sub i32 %b, %sext - store i32 %res, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll deleted file mode 100644 index 5aedda2ce1a..00000000000 --- a/test/CodeGen/R600/sext-in-reg.ll +++ /dev/null @@ -1,611 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - - -; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: -; SI: s_load_dword [[ARG:s[0-9]+]], -; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000 -; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]] -; SI: buffer_store_dword [[EXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1 -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) { - %shl = shl i32 %in, 31 - %sext = ashr i32 %shl, 31 - store i32 %sext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %shl = shl i32 %c, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %shl = shl i32 %c, 16 - %ashr = ashr i32 %shl, 16 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32: -; SI: s_add_i32 [[VAL:s[0-9]+]], -; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]] -; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]] -; SI: buffer_store_dword [[VEXTRACT]], - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG: ADD_INT -; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal -; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_v1i32(<1 x i32> 
addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { - %c = add <1 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <1 x i32> %c, - %ashr = ashr <1 x i32> %shl, - store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 63 - %ashr = ashr i64 %shl, 63 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 56 - %ashr = ashr i64 %shl, 56 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG: LSHL -; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal -; EG: ASHR [[RES_HI]] -; EG-NOT: BFE_INT -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 48 - %ashr = ashr i64 %shl, 48 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64: -; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]] -; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000 -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]] -; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]] -; EG-NOT: BFE_INT - -; EG: ASHR [[RES_HI]] - -; EG: LSHR -; EG: LSHR -;; TODO Check address computation, using | with variables in {{}} does not work, -;; also the _LO/_HI order might be different -define void @sext_in_reg_i32_to_i64(i64 
addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %c = shl i64 %a, %b - %shl = shl i64 %c, 32 - %ashr = ashr i64 %shl, 32 - store i64 %ashr, i64 addrspace(1)* %out, align 8 - ret void -} - -; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments. -; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64: -; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288 -; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31 -; XSI: buffer_store_dword -; XEG: BFE_INT -; XEG: ASHR -; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind { -; %c = add <1 x i64> %a, %b -; %shl = shl <1 x i64> %c, -; %ashr = ashr <1 x i64> %shl, -; store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8 -; ret void -; } - -; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 63 - %ashr = ashr i64 %shl, 63 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 56 - %ashr = ashr i64 %shl, 56 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64: -; SI: buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 -; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 48 - %ashr = ashr i64 %shl, 48 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64: -; SI: 
buffer_load_dwordx2 -; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}} -define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() - %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid - %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid - %a = load i64, i64 addrspace(1)* %a.gep, align 8 - %b = load i64, i64 addrspace(1)* %b.gep, align 8 - - %c = shl i64 %a, %b - %shl = shl i64 %c, 32 - %ashr = ashr i64 %shl, 32 - store i64 %ashr, i64 addrspace(1)* %out.gep, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount: -; SI-NOT: s_lshl -; SI-NOT: s_ashr -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] -; EG-NOT: BFE -; EG: ADD_INT -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b - %x = shl i32 %c, 6 - %y = ashr i32 %x, 7 - store i32 %y, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount: -; SI-NOT: s_lshl -; SI-NOT: s_ashr -; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 -; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001 -; SI: s_endpgm - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG-NOT: BFE -; EG: ADD_INT -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHL -; EG: ASHR [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b - %x = shl <2 x i32> %c, - %y = ashr <2 x i32> %x, - store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2 - ret void -} - - -; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32: -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32: -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000 -; SI: buffer_store_dwordx4 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %c = add <4 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: 
{{^}}sext_in_reg_v2i8_to_v2i32: -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32: -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx4 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { - %c = add <4 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32: -; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} -; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}} -; SI: buffer_store_dwordx2 - -; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]] -; EG: BFE_INT [[RES]] -; EG: BFE_INT [[RES]] -; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { - %c = add <2 x i32> %a, %b ; add to prevent folding into extload - %shl = shl <2 x i32> %c, - %ashr = ashr <2 x i32> %shl, - store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}testcase: -define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind { - %and_a_1 = and i8 %a, 1 - %cmp_eq = icmp eq i8 %and_a_1, 0 - %cmp_slt = icmp slt i8 %a, 0 - %sel0 = select i1 %cmp_slt, i8 0, i8 %a - %sel1 = select i1 %cmp_eq, i8 0, i8 %a - %xor = xor i8 %sel0, %sel1 - store i8 %xor, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}testcase_3: -define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind { - %and_a_1 = and i8 %a, 1 - %cmp_eq = icmp eq i8 %and_a_1, 0 - %cmp_slt = icmp slt i8 %a, 0 - %sel0 = select i1 %cmp_slt, i8 0, i8 %a - %sel1 = select i1 %cmp_eq, i8 0, i8 %a - %xor = xor i8 %sel0, %sel1 - store i8 %xor, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32: -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { - %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 - %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 - %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; 
FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32: -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind { - %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 - %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 - %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload - %shl = shl <4 x i32> %c, - %ashr = ashr <4 x i32> %shl, - store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type: -; SI: buffer_load_sbyte -; SI: v_max_i32 -; SI-NOT: bfe -; SI: buffer_store_short -define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { - %tmp5 = load i8, i8 addrspace(1)* %src, align 1 - %tmp2 = sext i8 %tmp5 to i32 - %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone - %tmp4 = trunc i32 %tmp3 to i8 - %tmp6 = sext i8 %tmp4 to i16 - store i16 %tmp6, i16 addrspace(1)* %out, align 2 - ret void -} - -declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone - -; FUNC-LABEL: {{^}}bfe_0_width: -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_8: -; SI: v_bfe_i32 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}bfe_8_bfe_16: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; SI: s_endpgm -define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; This really should be folded into 1 -; FUNC-LABEL: {{^}}bfe_16_bfe_8: -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind { - %load = load i32, i32 addrspace(1)* %ptr, align 4 - %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone - %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone - store i32 %bfe1, i32 addrspace(1)* %out, align 4 - ret void -} - -; Make sure there isn't a redundant BFE -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe: -; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 
addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: -define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %c = add i32 %a, %b ; add to prevent folding into extload - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe: -; SI: buffer_load_sbyte -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI: .text -; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} -; SI-NOT: {{[^@]}}bfe -; SI: s_endpgm -define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind { - %load = load i8, i8 addrspace(1)* %ptr, align 1 - %sext = sext i8 %load to i32 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone - %shl = shl i32 %bfe, 24 - %ashr = ashr i32 %shl, 24 - store i32 %ashr, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0: -; SI-NOT: shr -; SI-NOT: shl -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 -; SI: s_endpgm -define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 31 - %shr = ashr i32 %shl, 31 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1: -; SI: buffer_load_dword -; SI-NOT: shl -; SI-NOT: shr -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 -; SI: s_endpgm -define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1: -; SI: buffer_load_dword -; SI-NOT: v_lshl -; SI-NOT: v_ashr -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 -; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 -; SI: s_endpgm -define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %x = load i32, i32 addrspace(1)* %in, align 4 - %shl = shl i32 %x, 30 - %shr = ashr i32 %shl, 30 - %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2) - store i32 %bfe, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll deleted file mode 100644 index 38289ced632..00000000000 --- a/test/CodeGen/R600/sgpr-control-flow.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; -; -; Most SALU instructions ignore control flow, so we need to make sure -; they don't overwrite values from other blocks. 
- -; If the branch decision is made based on a value in an SGPR then all -; threads will execute the same code paths, so we don't need to worry -; about instructions in different blocks overwriting each other. -; SI-LABEL: {{^}}sgpr_if_else_salu_br: -; SI: s_add -; SI: s_add - -define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { -entry: - %0 = icmp eq i32 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = add i32 %b, %c - br label %endif - -else: - %2 = add i32 %d, %e - br label %endif - -endif: - %3 = phi i32 [%1, %if], [%2, %else] - %4 = add i32 %3, %a - store i32 %4, i32 addrspace(1)* %out - ret void -} - -; The two S_ADD instructions should write to different registers, since -; different threads will take different control flow paths. - -; SI-LABEL: {{^}}sgpr_if_else_valu_br: -; SI: s_add_i32 [[SGPR:s[0-9]+]] -; SI-NOT: s_add_i32 [[SGPR]] - -define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %tid_f = uitofp i32 %tid to float - %tmp1 = fcmp ueq float %tid_f, 0.0 - br i1 %tmp1, label %if, label %else - -if: - %tmp2 = add i32 %b, %c - br label %endif - -else: - %tmp3 = add i32 %d, %e - br label %endif - -endif: - %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else] - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - -; FIXME: Should write to different SGPR pairs instead of copying to -; VALU for i1 phi. - -; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br: -; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]] -; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]] - -; SI: BB2_1: -; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]] -; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]] - -; SI: v_cmp_ne_i32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] -; SI: buffer_store_dword [[RESULT]] -define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() #0 - %tmp1 = icmp eq i32 %tid, 0 - br i1 %tmp1, label %if, label %else - -if: - %gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid - %a.val = load i32, i32 addrspace(1)* %gep.if - %cmp.if = icmp eq i32 %a.val, 0 - br label %endif - -else: - %gep.else = getelementptr i32, i32 addrspace(1)* %b, i32 %tid - %b.val = load i32, i32 addrspace(1)* %gep.else - %cmp.else = icmp slt i32 %b.val, 0 - br label %endif - -endif: - %tmp4 = phi i1 [%cmp.if, %if], [%cmp.else, %else] - %ext = sext i1 %tmp4 to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.tidig.x() #0 - -attributes #0 = { readnone } diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll deleted file mode 100644 index df67fcca22f..00000000000 --- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; Copy VGPR -> SGPR used twice as an instruction operand, which is then -; used in an REG_SEQUENCE that also needs to be handled. 
- -; SI-LABEL: {{^}}test_dup_operands: -; SI: v_add_i32_e32 -define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %lo = extractelement <2 x i32> %a, i32 0 - %hi = extractelement <2 x i32> %a, i32 1 - %add = add i32 %lo, %lo - %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0 - %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1 - store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8 - ret void -} - diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll deleted file mode 100644 index b849c4038bc..00000000000 --- a/test/CodeGen/R600/sgpr-copy.ll +++ /dev/null @@ -1,379 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; This test checks that no VGPR to SGPR copies are created by the register -; allocator. -; CHECK-LABEL: {{^}}phi1: -; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 -; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] - -define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) - %25 = fptosi float %23 to i32 - %26 = icmp ne i32 %25, 0 - br i1 %26, label %ENDIF, label %ELSE - -ELSE: ; preds = %main_body - %27 = fsub float -0.000000e+00, %22 - br label %ENDIF - -ENDIF: ; preds = %main_body, %ELSE - %temp.0 = phi float [ %27, %ELSE ], [ %22, %main_body ] - %28 = fadd float %temp.0, %24 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %28, float %28, float 0.000000e+00, float 1.000000e+00) - ret void -} - -; Make sure this program doesn't crash -; CHECK-LABEL: {{^}}phi2: -define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36) - %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40) - %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48) - %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52) - %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56) - %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64) - %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68) - %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72) - %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76) - %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80) - %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84) - %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88) - %36 = call float 
@llvm.SI.load.const(<16 x i8> %21, i32 92) - %37 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %38 = load <32 x i8>, <32 x i8> addrspace(2)* %37, !tbaa !1 - %39 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 - %40 = load <16 x i8>, <16 x i8> addrspace(2)* %39, !tbaa !1 - %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) - %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) - %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5) - %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5) - %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5) - %46 = bitcast float %41 to i32 - %47 = bitcast float %42 to i32 - %48 = insertelement <2 x i32> undef, i32 %46, i32 0 - %49 = insertelement <2 x i32> %48, i32 %47, i32 1 - %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2) - %51 = extractelement <4 x float> %50, i32 2 - %52 = call float @fabs(float %51) - %53 = fmul float %43, %43 - %54 = fmul float %44, %44 - %55 = fadd float %54, %53 - %56 = fmul float %45, %45 - %57 = fadd float %55, %56 - %58 = call float @llvm.AMDGPU.rsq.f32(float %57) - %59 = fmul float %43, %58 - %60 = fmul float %44, %58 - %61 = fmul float %45, %58 - %62 = fmul float %59, %23 - %63 = fmul float %60, %24 - %64 = fadd float %63, %62 - %65 = fmul float %61, %25 - %66 = fadd float %64, %65 - %67 = fsub float -0.000000e+00, %26 - %68 = fmul float %66, %52 - %69 = fadd float %68, %67 - %70 = fmul float %27, %69 - %71 = fmul float %28, %69 - %72 = call float @fabs(float %70) - %73 = fcmp olt float 0x3EE4F8B580000000, %72 - %74 = sext i1 %73 to i32 - %75 = bitcast i32 %74 to float - %76 = bitcast float %75 to i32 - %77 = icmp ne i32 %76, 0 - br i1 %77, label %IF, label %ENDIF - -IF: ; preds = %main_body - %78 = fsub float -0.000000e+00, %70 - %79 = call float @llvm.AMDIL.exp.(float %78) - %80 = fsub float -0.000000e+00, %79 - %81 = fadd float 1.000000e+00, %80 - %82 = fdiv float 1.000000e+00, %70 - %83 = fmul float %81, %82 - %84 = fmul float %32, %83 - br label %ENDIF - -ENDIF: ; preds = %main_body, %IF - %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ] - %85 = call float @fabs(float %71) - %86 = fcmp olt float 0x3EE4F8B580000000, %85 - %87 = sext i1 %86 to i32 - %88 = bitcast i32 %87 to float - %89 = bitcast float %88 to i32 - %90 = icmp ne i32 %89, 0 - br i1 %90, label %IF25, label %ENDIF24 - -IF25: ; preds = %ENDIF - %91 = fsub float -0.000000e+00, %71 - %92 = call float @llvm.AMDIL.exp.(float %91) - %93 = fsub float -0.000000e+00, %92 - %94 = fadd float 1.000000e+00, %93 - %95 = fdiv float 1.000000e+00, %71 - %96 = fmul float %94, %95 - %97 = fmul float %36, %96 - br label %ENDIF24 - -ENDIF24: ; preds = %ENDIF, %IF25 - %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ] - %98 = fmul float %29, %temp4.0 - %99 = fmul float %30, %temp4.0 - %100 = fmul float %31, %temp4.0 - %101 = fmul float %33, %temp8.0 - %102 = fadd float %101, %98 - %103 = fmul float %34, %temp8.0 - %104 = fadd float %103, %99 - %105 = fmul float %35, %temp8.0 - %106 = fadd float %105, %100 - %107 = call float @llvm.pow.f32(float %52, float %22) - %108 = fsub float -0.000000e+00, %102 - %109 = fmul float %108, %107 - %110 = fsub float -0.000000e+00, %104 - %111 = fmul float %110, %107 - %112 = fsub float -0.000000e+00, %106 - %113 = fmul float %112, %107 - %114 = call i32 @llvm.SI.packf16(float %109, float %111) - %115 = bitcast i32 %114 to float - %116 = call i32 @llvm.SI.packf16(float 
%113, float 1.000000e+00) - %117 = bitcast i32 %116 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117) - ret void -} - -; We just want ot make sure the program doesn't crash -; CHECK-LABEL: {{^}}loop: - -define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0) - %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4) - %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8) - %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 12) - %26 = fptosi float %25 to i32 - %27 = bitcast i32 %26 to float - %28 = bitcast float %27 to i32 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp4.0 = phi float [ %22, %main_body ], [ %temp5.0, %ENDIF ] - %temp5.0 = phi float [ %23, %main_body ], [ %temp6.0, %ENDIF ] - %temp6.0 = phi float [ %24, %main_body ], [ %temp4.0, %ENDIF ] - %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %37, %ENDIF ] - %29 = bitcast float %temp8.0 to i32 - %30 = icmp sge i32 %29, %28 - %31 = sext i1 %30 to i32 - %32 = bitcast i32 %31 to float - %33 = bitcast float %32 to i32 - %34 = icmp ne i32 %33, 0 - br i1 %34, label %IF, label %ENDIF - -IF: ; preds = %LOOP - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00) - ret void - -ENDIF: ; preds = %LOOP - %35 = bitcast float %temp8.0 to i32 - %36 = add i32 %35, 1 - %37 = bitcast i32 %36 to float - br label %LOOP -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: readonly -declare float @fabs(float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readonly } -attributes #3 = { readnone } -attributes #4 = { nounwind readonly } - -!0 = !{!"const", null} -!1 = !{!0, !0, i64 0, i32 1} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #3 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #3 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #4 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -; This checks for a bug in the FixSGPRCopies pass where VReg96 -; registers were being identified as an SGPR regclass which was causing -; an assertion failure. 
- -; CHECK-LABEL: {{^}}sample_v3: -; CHECK: image_sample -; CHECK: image_sample -; CHECK: exp -; CHECK: s_endpgm -define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { - -entry: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !2 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16) - %24 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %25 = load <32 x i8>, <32 x i8> addrspace(2)* %24, !tbaa !2 - %26 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %27 = load <16 x i8>, <16 x i8> addrspace(2)* %26, !tbaa !2 - %28 = fcmp oeq float %23, 0.0 - br i1 %28, label %if, label %else - -if: - %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) - %val.if.0 = extractelement <4 x float> %val.if, i32 0 - %val.if.1 = extractelement <4 x float> %val.if, i32 1 - %val.if.2 = extractelement <4 x float> %val.if, i32 2 - br label %endif - -else: - %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> , <32 x i8> %25, <16 x i8> %27, i32 2) - %val.else.0 = extractelement <4 x float> %val.else, i32 0 - %val.else.1 = extractelement <4 x float> %val.else, i32 1 - %val.else.2 = extractelement <4 x float> %val.else, i32 2 - br label %endif - -endif: - %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else] - %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else] - %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else] - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0) - ret void -} - -!2 = !{!"const", null, i32 1} - -; CHECK-LABEL: {{^}}copy1: -; CHECK: buffer_load_dword -; CHECK: v_add -; CHECK: s_endpgm -define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { -entry: - %0 = load float, float addrspace(1)* %in0 - %1 = fcmp oeq float %0, 0.0 - br i1 %1, label %if0, label %endif - -if0: - %2 = bitcast float %0 to i32 - %3 = fcmp olt float %0, 0.0 - br i1 %3, label %if1, label %endif - -if1: - %4 = add i32 %2, 1 - br label %endif - -endif: - %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ] - %6 = bitcast i32 %5 to float - store float %6, float addrspace(1)* %out - ret void -} - -; This test is just checking that we don't crash / assertion fail. 
-; CHECK-LABEL: {{^}}copy2: -; CHECK: s_endpgm - -define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -entry: - br label %LOOP68 - -LOOP68: - %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ] - %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ] - %g = icmp eq i32 0, %t - %l = bitcast float %temp4.7 to i32 - br i1 %g, label %IF70, label %ENDIF69 - -IF70: - %q = icmp ne i32 %l, 13 - %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - ret void - -ENDIF69: - %u = add i32 %l, %t - %v = bitcast i32 %u to float - %x = add i32 %t, -1 - br label %LOOP68 -} - -attributes #0 = { "ShaderType"="0" } - -; This test checks that image_sample resource descriptors aren't loaded into -; vgprs. The verifier will fail if this happens. -; CHECK-LABEL:{{^}}sample_rsrc: -; CHECK: image_sample -; CHECK: image_sample -; CHECK: s_endpgm -define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { -bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) - %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 - %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !0 - %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 - %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !0 - %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7) - %tmp31 = bitcast float %tmp23 to i32 - %tmp36 = icmp ne i32 %tmp31, 0 - br i1 %tmp36, label %bb38, label %bb80 - -bb38: ; preds = %bb - %tmp52 = bitcast float %tmp29 to i32 - %tmp53 = bitcast float %tmp30 to i32 - %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 - %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 - %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2) - br label %bb71 - -bb80: ; preds = %bb - %tmp81 = bitcast float %tmp29 to i32 - %tmp82 = bitcast float %tmp30 to i32 - %tmp82.2 = add i32 %tmp82, 1 - %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 - %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 - %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8> - %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8> - %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2) - br label %bb71 - -bb71: ; preds = %bb80, %bb38 - %tmp72 = phi <4 x float> [ %tmp58, 
%bb38 ], [ %tmp87, %bb80 ] - %tmp88 = extractelement <4 x float> %tmp72, i32 0 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88) - ret void -} - -attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/shared-op-cycle.ll b/test/CodeGen/R600/shared-op-cycle.ll deleted file mode 100644 index f52a9baf4d1..00000000000 --- a/test/CodeGen/R600/shared-op-cycle.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; CHECK: {{^}}main: -; CHECK: MULADD_IEEE * -; CHECK-NOT: MULADD_IEEE * - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { - %w0 = extractelement <4 x float> %reg0, i32 3 - %w1 = extractelement <4 x float> %reg1, i32 3 - %w2 = extractelement <4 x float> %reg2, i32 3 - %sq0 = fmul float %w0, %w0 - %r0 = fadd float %sq0, 2.0 - %sq1 = fmul float %w1, %w1 - %r1 = fadd float %sq1, 2.0 - %sq2 = fmul float %w2, %w2 - %r2 = fadd float %sq2, 2.0 - %v0 = insertelement <4 x float> undef, float %r0, i32 0 - %v1 = insertelement <4 x float> %v0, float %r1, i32 1 - %v2 = insertelement <4 x float> %v1, float %r2, i32 2 - %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2) - %vecres = insertelement <4 x float> undef, float %res, i32 0 - call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2) - ret void -} - -; Function Attrs: readnone -declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } \ No newline at end of file diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll deleted file mode 100644 index 53b63dc4b8a..00000000000 --- a/test/CodeGen/R600/shl.ll +++ /dev/null @@ -1,180 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s - -;EG: {{^}}shl_v2i32: -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}shl_v2i32: -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI: {{^}}shl_v2i32: -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = shl <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v4i32: -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: LSHL {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}shl_v4i32: -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI: {{^}}shl_v4i32: -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = shl <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_i64: -;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -;EG: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: LSHL {{\*? *}}[[HISMTMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], {{[[HISMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: LSHL {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], [[OPLO]], {{PS|[[SHIFT]]}} -;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -;EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 - -;SI: {{^}}shl_i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1) * %in - %b = load i64, i64 addrspace(1) * %b_ptr - %result = shl i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v2i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI: {{^}}shl_v2i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_v2i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1) * %in - %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr - %result = shl <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -;EG: {{^}}shl_v4i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHC]] -;EG-DAG: LSHR {{\*? *}}[[COMPSHD]] -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: LSHR {{.*}}, 1 -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHC]] -;EG-DAG: LSHL {{.*}}, [[SHD]] -;EG-DAG: LSHL {{.*}}, [[SHA]] -;EG-DAG: LSHL {{.*}}, [[SHB]] -;EG-DAG: LSHL {{.*}}, [[SHC]] -;EG-DAG: LSHL {{.*}}, [[SHD]] -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: LSHL -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -;EG-DAG: SETGT_UINT {{\*? 
*T[0-9]\.[XYZW]}}, [[SHD]], literal -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT {{.*}}, 0.0 -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI: {{^}}shl_v4i64: -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI: {{^}}shl_v4i64: -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1) * %in - %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr - %result = shl <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/shl_add_constant.ll b/test/CodeGen/R600/shl_add_constant.ll deleted file mode 100644 index b1485bfaaeb..00000000000 --- a/test/CodeGen/R600/shl_add_constant.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; Test with inline immediate - -; FUNC-LABEL: {{^}}shl_2_add_9_i32: -; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %add = add i32 %val, 9 - %result = shl i32 %add, 2 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}shl_2_add_9_i32_2_add_uses: -; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}} -; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}} -; SI-DAG: buffer_store_dword [[ADDREG]] -; SI-DAG: buffer_store_dword [[SHLREG]] -; SI: s_endpgm -define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %add = add i32 %val, 9 - %result = shl i32 %add, 2 - store i32 %result, i32 addrspace(1)* %out0, align 4 - store i32 %add, i32 addrspace(1)* %out1, align 4 - ret void -} - -; Test with add literal constant - -; FUNC-LABEL: {{^}}shl_2_add_999_i32: -; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], 2, {{v[0-9]+}} -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x - %val = load i32, i32 addrspace(1)* %ptr, align 4 - %shl = add i32 %val, 999 - %result = shl i32 %shl, 2 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: 
{{^}}test_add_shl_add_constant: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] -; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %add.0 = add i32 %x, 123 - %shl = shl i32 %add.0, 3 - %add.1 = add i32 %shl, %y - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv: -; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 -; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] -; SI: buffer_store_dword [[VRESULT]] - -define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { - %add.0 = add i32 %x, 123 - %shl = shl i32 %add.0, 3 - %add.1 = add i32 %y, %shl - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll deleted file mode 100644 index 6671e909cd1..00000000000 --- a/test/CodeGen/R600/shl_add_ptr.ll +++ /dev/null @@ -1,284 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - -; Test that doing a shift of a pointer with a constant add will be -; folded into the constant offset addressing mode even if the add has -; multiple uses. This is relevant to accessing 2 separate, adjacent -; LDS globals. - - -declare i32 @llvm.r600.read.tidig.x() #1 - -@lds0 = addrspace(3) global [512 x float] undef, align 4 -@lds1 = addrspace(3) global [512 x float] undef, align 4 - - -; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8 - -; SI-LABEL: {{^}}load_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 -; SI: s_endpgm -define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - store float %val0, float addrspace(1)* %out - ret void -} - -; Make sure once the first use is folded into the addressing mode, the -; remaining add use goes through the normal shl + add constant fold. 
- -; SI-LABEL: {{^}}load_shl_base_lds_1: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 -; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}} -; SI-DAG: buffer_store_dword [[RESULT]] -; SI-DAG: buffer_store_dword [[ADDUSE]] -; SI: s_endpgm -define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %shl_add_use = shl i32 %idx.0, 2 - store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4 - store float %val0, float addrspace(1)* %out - ret void -} - -@maxlds = addrspace(3) global [65536 x i8] undef, align 4 - -; SI-LABEL: {{^}}load_shl_base_lds_max_offset -; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 -; SI: s_endpgm -define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 65535 - %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 - %val0 = load i8, i8 addrspace(3)* %arrayidx0 - store i32 %idx.0, i32 addrspace(1)* %add_use - store i8 %val0, i8 addrspace(1)* %out - ret void -} - -; The two globals are placed adjacent in memory, so the same base -; pointer can be used with an offset into the second one. - -; SI-LABEL: {{^}}load_shl_base_lds_2: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: s_mov_b32 m0, -1 -; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 -; SI: s_endpgm -define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 64 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - %val0 = load float, float addrspace(3)* %arrayidx0, align 4 - %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0 - %val1 = load float, float addrspace(3)* %arrayidx1, align 4 - %sum = fadd float %val0, %val1 - store float %sum, float addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}store_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 - store float 1.0, float addrspace(3)* %arrayidx0, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - - -; -------------------------------------------------------------------------------- -; Atomics. 
- -@lds2 = addrspace(3) global [512 x i32] undef, align 4 - -; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { -; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 -; %idx.0 = add nsw i32 %tid.x, 2 -; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 -; %val = load atomic i32, i32 addrspace(3)* %arrayidx0 seq_cst, align 4 -; store i32 %val, i32 addrspace(1)* %out, align 4 -; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 -; ret void -; } - - -; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic - %result = extractvalue { i32, i1 } %pair, 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_add_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_and_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 
addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_or_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { -; %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 -; %idx.0 = add nsw i32 %tid.x, 2 -; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 -; %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst -; store i32 %val, i32 addrspace(1)* %out, align 4 -; store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 -; ret void -; } - -; SI-LABEL: {{^}}atomic_min_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_max_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: 
{{^}}atomic_umin_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0: -; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} -; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -; SI: s_endpgm -define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { - %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1 - %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 - %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst - store i32 %val, i32 addrspace(1)* %out, align 4 - store i32 %idx.0, i32 addrspace(1)* %add_use, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll deleted file mode 100644 index 69d719385ac..00000000000 --- a/test/CodeGen/R600/si-annotate-cf-assertion.ll +++ /dev/null @@ -1,25 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs-asm-verbose=false < %s | FileCheck %s - - -define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { -; CHECK-LABEL: {{^}}test: - -entry: - switch i32 %x, label %sw.default [ - i32 0, label %sw.bb - i32 60, label %sw.bb - ] - -sw.bb: - unreachable - -sw.default: - unreachable - -sw.epilog: - ret void -} - diff --git a/test/CodeGen/R600/si-annotate-cf.ll b/test/CodeGen/R600/si-annotate-cf.ll deleted file mode 100644 index bbcb861f37d..00000000000 --- a/test/CodeGen/R600/si-annotate-cf.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}break_inserted_outside_of_loop: - -; SI: [[LOOP_LABEL:[A-Z0-9]+]]: -; Lowered break instructin: -; SI: s_or_b64 -; Lowered Loop instruction: -; SI: s_andn2_b64 -; s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a, i32 %b) { -main_body: - %0 = and i32 %a, %b - %1 = trunc i32 %0 to i1 - br label %ENDIF - -ENDLOOP: - store i32 0, i32 addrspace(1)* %out - ret void - -ENDIF: - br i1 %1, label %ENDLOOP, label %ENDIF -} - - -; FUNC-LABEL: {{^}}phi_cond_outside_loop: -; FIXME: This could be folded into the s_or_b64 instruction -; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0 -; SI: [[LOOP_LABEL:[A-Z0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} - -; SI_IF_BREAK instruction: -; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]] - -; SI_LOOP instruction: -; SI: s_andn2_b64 exec, exec, [[BREAK]] -; SI: s_cbranch_execnz [[LOOP_LABEL]] -; SI: s_endpgm - 
-define void @phi_cond_outside_loop(i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a , 0 - br i1 %0, label %if, label %else - -if: - br label %endif - -else: - %1 = icmp eq i32 %b, 0 - br label %endif - -endif: - %2 = phi i1 [0, %if], [%1, %else] - br label %loop - -loop: - br i1 %2, label %exit, label %loop - -exit: - ret void -} diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll deleted file mode 100644 index 944499a1146..00000000000 --- a/test/CodeGen/R600/si-lod-bias.ll +++ /dev/null @@ -1,52 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; This shader has the potential to generated illegal VGPR to SGPR copies if -; the wrong register class is used for the REG_SEQUENCE instructions. - -; CHECK: {{^}}main: -; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}} - -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20, !tbaa !1 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - %23 = getelementptr <32 x i8>, <32 x i8> addrspace(2)* %2, i32 0 - %24 = load <32 x i8>, <32 x i8> addrspace(2)* %23, !tbaa !1 - %25 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %1, i32 0 - %26 = load <16 x i8>, <16 x i8> addrspace(2)* %25, !tbaa !1 - %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5) - %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5) - %29 = bitcast float %22 to i32 - %30 = bitcast float %27 to i32 - %31 = bitcast float %28 to i32 - %32 = insertelement <4 x i32> undef, i32 %29, i32 0 - %33 = insertelement <4 x i32> %32, i32 %30, i32 1 - %34 = insertelement <4 x i32> %33, i32 %31, i32 2 - %35 = insertelement <4 x i32> %34, i32 undef, i32 3 - %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2) - %37 = extractelement <4 x float> %36, i32 0 - %38 = extractelement <4 x float> %36, i32 1 - %39 = extractelement <4 x float> %36, i32 2 - %40 = extractelement <4 x float> %36, i32 3 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40) - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } - -!0 = !{!"const", null} -!1 = !{!0, !0, i64 0, i32 1} diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll deleted file mode 100644 index 84652701f77..00000000000 --- a/test/CodeGen/R600/si-sgpr-spill.ll +++ /dev/null @@ -1,1568 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s - -; These tests check that the compiler won't crash when it needs to spill -; SGPRs. 
- -; CHECK-LABEL: {{^}}main: -; CHECK: s_wqm -; Writing to M0 from an SMRD instruction will hang the GPU. -; CHECK-NOT: s_buffer_load_dword m0 -; CHECK: s_endpgm -@ddxy_lds = external addrspace(3) global [64 x i32] - -define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96) - %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100) - %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104) - %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112) - %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116) - %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120) - %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) - %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) - %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140) - %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) - %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) - %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) - %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) - %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) - %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) - %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) - %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) - %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) - %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) - %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) - %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224) - %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) - %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) - %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) - %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) - %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) - %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) - %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) - %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) - %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) - %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296) - %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304) - %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308) - %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312) - %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368) - %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372) - %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376) - %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384) - %61 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %62 = load <32 x i8>, <32 x i8> addrspace(2)* %61, !tbaa !0 - %63 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %64 = load <16 x i8>, <16 x i8> addrspace(2)* %63, !tbaa !0 - %65 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 - %66 = load <32 x i8>, <32 x i8> addrspace(2)* %65, !tbaa !0 - %67 = getelementptr [32 x 
<16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 - %68 = load <16 x i8>, <16 x i8> addrspace(2)* %67, !tbaa !0 - %69 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 - %70 = load <32 x i8>, <32 x i8> addrspace(2)* %69, !tbaa !0 - %71 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 - %72 = load <16 x i8>, <16 x i8> addrspace(2)* %71, !tbaa !0 - %73 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 - %74 = load <32 x i8>, <32 x i8> addrspace(2)* %73, !tbaa !0 - %75 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 - %76 = load <16 x i8>, <16 x i8> addrspace(2)* %75, !tbaa !0 - %77 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 - %78 = load <32 x i8>, <32 x i8> addrspace(2)* %77, !tbaa !0 - %79 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 - %80 = load <16 x i8>, <16 x i8> addrspace(2)* %79, !tbaa !0 - %81 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 - %82 = load <32 x i8>, <32 x i8> addrspace(2)* %81, !tbaa !0 - %83 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5 - %84 = load <16 x i8>, <16 x i8> addrspace(2)* %83, !tbaa !0 - %85 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 - %86 = load <32 x i8>, <32 x i8> addrspace(2)* %85, !tbaa !0 - %87 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 - %88 = load <16 x i8>, <16 x i8> addrspace(2)* %87, !tbaa !0 - %89 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 - %90 = load <32 x i8>, <32 x i8> addrspace(2)* %89, !tbaa !0 - %91 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 - %92 = load <16 x i8>, <16 x i8> addrspace(2)* %91, !tbaa !0 - %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) - %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) - %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) - %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) - %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) - %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) - %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) - %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) - %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) - %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) - %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) - %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) - %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) - %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) - %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) - %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) - %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) - %110 = call i32 @llvm.SI.tid() - %111 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110 - %112 = bitcast float %93 to i32 - store i32 %112, i32 addrspace(3)* %111 - %113 = bitcast float %94 to i32 - store i32 %113, i32 addrspace(3)* %111 - %114 = call i32 @llvm.SI.tid() - %115 = getelementptr [64 x i32], [64 x i32] addrspace(3)* 
@ddxy_lds, i32 0, i32 %114 - %116 = and i32 %114, -4 - %117 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116 - %118 = add i32 %116, 1 - %119 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118 - %120 = bitcast float %93 to i32 - store i32 %120, i32 addrspace(3)* %115 - %121 = load i32, i32 addrspace(3)* %117 - %122 = bitcast i32 %121 to float - %123 = load i32, i32 addrspace(3)* %119 - %124 = bitcast i32 %123 to float - %125 = fsub float %124, %122 - %126 = bitcast float %94 to i32 - store i32 %126, i32 addrspace(3)* %115 - %127 = load i32, i32 addrspace(3)* %117 - %128 = bitcast i32 %127 to float - %129 = load i32, i32 addrspace(3)* %119 - %130 = bitcast i32 %129 to float - %131 = fsub float %130, %128 - %132 = insertelement <4 x float> undef, float %125, i32 0 - %133 = insertelement <4 x float> %132, float %131, i32 1 - %134 = insertelement <4 x float> %133, float %131, i32 2 - %135 = insertelement <4 x float> %134, float %131, i32 3 - %136 = extractelement <4 x float> %135, i32 0 - %137 = extractelement <4 x float> %135, i32 1 - %138 = fmul float %60, %93 - %139 = fmul float %60, %94 - %140 = fmul float %60, %94 - %141 = fmul float %60, %94 - %142 = call i32 @llvm.SI.tid() - %143 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142 - %144 = bitcast float %138 to i32 - store i32 %144, i32 addrspace(3)* %143 - %145 = bitcast float %139 to i32 - store i32 %145, i32 addrspace(3)* %143 - %146 = bitcast float %140 to i32 - store i32 %146, i32 addrspace(3)* %143 - %147 = bitcast float %141 to i32 - store i32 %147, i32 addrspace(3)* %143 - %148 = call i32 @llvm.SI.tid() - %149 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148 - %150 = and i32 %148, -4 - %151 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150 - %152 = add i32 %150, 2 - %153 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152 - %154 = bitcast float %138 to i32 - store i32 %154, i32 addrspace(3)* %149 - %155 = load i32, i32 addrspace(3)* %151 - %156 = bitcast i32 %155 to float - %157 = load i32, i32 addrspace(3)* %153 - %158 = bitcast i32 %157 to float - %159 = fsub float %158, %156 - %160 = bitcast float %139 to i32 - store i32 %160, i32 addrspace(3)* %149 - %161 = load i32, i32 addrspace(3)* %151 - %162 = bitcast i32 %161 to float - %163 = load i32, i32 addrspace(3)* %153 - %164 = bitcast i32 %163 to float - %165 = fsub float %164, %162 - %166 = bitcast float %140 to i32 - store i32 %166, i32 addrspace(3)* %149 - %167 = load i32, i32 addrspace(3)* %151 - %168 = bitcast i32 %167 to float - %169 = load i32, i32 addrspace(3)* %153 - %170 = bitcast i32 %169 to float - %171 = fsub float %170, %168 - %172 = bitcast float %141 to i32 - store i32 %172, i32 addrspace(3)* %149 - %173 = load i32, i32 addrspace(3)* %151 - %174 = bitcast i32 %173 to float - %175 = load i32, i32 addrspace(3)* %153 - %176 = bitcast i32 %175 to float - %177 = fsub float %176, %174 - %178 = insertelement <4 x float> undef, float %159, i32 0 - %179 = insertelement <4 x float> %178, float %165, i32 1 - %180 = insertelement <4 x float> %179, float %171, i32 2 - %181 = insertelement <4 x float> %180, float %177, i32 3 - %182 = extractelement <4 x float> %181, i32 0 - %183 = extractelement <4 x float> %181, i32 1 - %184 = fdiv float 1.000000e+00, %97 - %185 = fmul float %33, %184 - %186 = fcmp uge float 1.000000e+00, %185 - %187 = select i1 %186, float %185, float 1.000000e+00 - %188 = fmul float 
%187, %30 - %189 = call float @ceil(float %188) - %190 = fcmp uge float 3.000000e+00, %189 - %191 = select i1 %190, float 3.000000e+00, float %189 - %192 = fdiv float 1.000000e+00, %191 - %193 = fdiv float 1.000000e+00, %30 - %194 = fmul float %191, %193 - %195 = fmul float %31, %194 - %196 = fmul float %95, %95 - %197 = fmul float %96, %96 - %198 = fadd float %197, %196 - %199 = fmul float %97, %97 - %200 = fadd float %198, %199 - %201 = call float @llvm.AMDGPU.rsq.f32(float %200) - %202 = fmul float %95, %201 - %203 = fmul float %96, %201 - %204 = fmul float %202, %29 - %205 = fmul float %203, %29 - %206 = fmul float %204, -1.000000e+00 - %207 = fmul float %205, 1.000000e+00 - %208 = fmul float %206, %32 - %209 = fmul float %207, %32 - %210 = fsub float -0.000000e+00, %208 - %211 = fadd float %93, %210 - %212 = fsub float -0.000000e+00, %209 - %213 = fadd float %94, %212 - %214 = fmul float %206, %192 - %215 = fmul float %207, %192 - %216 = fmul float -1.000000e+00, %192 - %217 = bitcast float %136 to i32 - %218 = bitcast float %182 to i32 - %219 = bitcast float %137 to i32 - %220 = bitcast float %183 to i32 - %221 = insertelement <8 x i32> undef, i32 %217, i32 0 - %222 = insertelement <8 x i32> %221, i32 %218, i32 1 - %223 = insertelement <8 x i32> %222, i32 %219, i32 2 - %224 = insertelement <8 x i32> %223, i32 %220, i32 3 - br label %LOOP - -LOOP: ; preds = %ENDIF, %main_body - %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ] - %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ] - %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ] - %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ] - %225 = fcmp oge float %temp24.0, %191 - %226 = sext i1 %225 to i32 - %227 = bitcast i32 %226 to float - %228 = bitcast float %227 to i32 - %229 = icmp ne i32 %228, 0 - br i1 %229, label %IF, label %ENDIF - -IF: ; preds = %LOOP - %230 = bitcast float %136 to i32 - %231 = bitcast float %182 to i32 - %232 = bitcast float %137 to i32 - %233 = bitcast float %183 to i32 - %234 = insertelement <8 x i32> undef, i32 %230, i32 0 - %235 = insertelement <8 x i32> %234, i32 %231, i32 1 - %236 = insertelement <8 x i32> %235, i32 %232, i32 2 - %237 = insertelement <8 x i32> %236, i32 %233, i32 3 - br label %LOOP65 - -ENDIF: ; preds = %LOOP - %238 = bitcast float %temp28.0 to i32 - %239 = bitcast float %temp29.0 to i32 - %240 = insertelement <8 x i32> %224, i32 %238, i32 4 - %241 = insertelement <8 x i32> %240, i32 %239, i32 5 - %242 = insertelement <8 x i32> %241, i32 undef, i32 6 - %243 = insertelement <8 x i32> %242, i32 undef, i32 7 - %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2) - %245 = extractelement <4 x float> %244, i32 3 - %246 = fcmp oge float %temp30.0, %245 - %247 = sext i1 %246 to i32 - %248 = bitcast i32 %247 to float - %249 = bitcast float %248 to i32 - %250 = and i32 %249, 1065353216 - %251 = bitcast i32 %250 to float - %252 = fmul float %214, %251 - %253 = fadd float %252, %temp28.0 - %254 = fmul float %215, %251 - %255 = fadd float %254, %temp29.0 - %256 = fmul float %216, %251 - %257 = fadd float %256, %temp30.0 - %258 = fadd float %temp24.0, 1.000000e+00 - br label %LOOP - -LOOP65: ; preds = %ENDIF66, %IF - %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ] - %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ] - %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ] - %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ] - %temp32.0 = phi float [ 
1.000000e+00, %IF ], [ %611, %ENDIF66 ] - %259 = fcmp oge float %temp24.1, %195 - %260 = sext i1 %259 to i32 - %261 = bitcast i32 %260 to float - %262 = bitcast float %261 to i32 - %263 = icmp ne i32 %262, 0 - br i1 %263, label %IF67, label %ENDIF66 - -IF67: ; preds = %LOOP65 - %264 = bitcast float %136 to i32 - %265 = bitcast float %182 to i32 - %266 = bitcast float %137 to i32 - %267 = bitcast float %183 to i32 - %268 = bitcast float %temp28.1 to i32 - %269 = bitcast float %temp29.1 to i32 - %270 = insertelement <8 x i32> undef, i32 %264, i32 0 - %271 = insertelement <8 x i32> %270, i32 %265, i32 1 - %272 = insertelement <8 x i32> %271, i32 %266, i32 2 - %273 = insertelement <8 x i32> %272, i32 %267, i32 3 - %274 = insertelement <8 x i32> %273, i32 %268, i32 4 - %275 = insertelement <8 x i32> %274, i32 %269, i32 5 - %276 = insertelement <8 x i32> %275, i32 undef, i32 6 - %277 = insertelement <8 x i32> %276, i32 undef, i32 7 - %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2) - %279 = extractelement <4 x float> %278, i32 0 - %280 = extractelement <4 x float> %278, i32 1 - %281 = extractelement <4 x float> %278, i32 2 - %282 = extractelement <4 x float> %278, i32 3 - %283 = fmul float %282, %47 - %284 = bitcast float %136 to i32 - %285 = bitcast float %182 to i32 - %286 = bitcast float %137 to i32 - %287 = bitcast float %183 to i32 - %288 = bitcast float %temp28.1 to i32 - %289 = bitcast float %temp29.1 to i32 - %290 = insertelement <8 x i32> undef, i32 %284, i32 0 - %291 = insertelement <8 x i32> %290, i32 %285, i32 1 - %292 = insertelement <8 x i32> %291, i32 %286, i32 2 - %293 = insertelement <8 x i32> %292, i32 %287, i32 3 - %294 = insertelement <8 x i32> %293, i32 %288, i32 4 - %295 = insertelement <8 x i32> %294, i32 %289, i32 5 - %296 = insertelement <8 x i32> %295, i32 undef, i32 6 - %297 = insertelement <8 x i32> %296, i32 undef, i32 7 - %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2) - %299 = extractelement <4 x float> %298, i32 0 - %300 = extractelement <4 x float> %298, i32 1 - %301 = extractelement <4 x float> %298, i32 2 - %302 = bitcast float %136 to i32 - %303 = bitcast float %182 to i32 - %304 = bitcast float %137 to i32 - %305 = bitcast float %183 to i32 - %306 = bitcast float %temp28.1 to i32 - %307 = bitcast float %temp29.1 to i32 - %308 = insertelement <8 x i32> undef, i32 %302, i32 0 - %309 = insertelement <8 x i32> %308, i32 %303, i32 1 - %310 = insertelement <8 x i32> %309, i32 %304, i32 2 - %311 = insertelement <8 x i32> %310, i32 %305, i32 3 - %312 = insertelement <8 x i32> %311, i32 %306, i32 4 - %313 = insertelement <8 x i32> %312, i32 %307, i32 5 - %314 = insertelement <8 x i32> %313, i32 undef, i32 6 - %315 = insertelement <8 x i32> %314, i32 undef, i32 7 - %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2) - %317 = extractelement <4 x float> %316, i32 0 - %318 = extractelement <4 x float> %316, i32 1 - %319 = extractelement <4 x float> %316, i32 2 - %320 = fmul float %317, %23 - %321 = fmul float %318, %24 - %322 = fmul float %319, %25 - %323 = fmul float %299, %26 - %324 = fadd float %323, %320 - %325 = fmul float %300, %27 - %326 = fadd float %325, %321 - %327 = fmul float %301, %28 - %328 = fadd float %327, %322 - %329 = fadd float %279, %324 - %330 = fadd float %280, %326 - %331 = fadd float %281, %328 - %332 = bitcast float %136 to i32 - %333 = bitcast float %182 to i32 - %334 = bitcast float %137 to i32 - 
%335 = bitcast float %183 to i32 - %336 = bitcast float %temp28.1 to i32 - %337 = bitcast float %temp29.1 to i32 - %338 = insertelement <8 x i32> undef, i32 %332, i32 0 - %339 = insertelement <8 x i32> %338, i32 %333, i32 1 - %340 = insertelement <8 x i32> %339, i32 %334, i32 2 - %341 = insertelement <8 x i32> %340, i32 %335, i32 3 - %342 = insertelement <8 x i32> %341, i32 %336, i32 4 - %343 = insertelement <8 x i32> %342, i32 %337, i32 5 - %344 = insertelement <8 x i32> %343, i32 undef, i32 6 - %345 = insertelement <8 x i32> %344, i32 undef, i32 7 - %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2) - %347 = extractelement <4 x float> %346, i32 0 - %348 = extractelement <4 x float> %346, i32 1 - %349 = extractelement <4 x float> %346, i32 2 - %350 = fadd float %347, -5.000000e-01 - %351 = fadd float %348, -5.000000e-01 - %352 = fadd float %349, -5.000000e-01 - %353 = fmul float %350, %350 - %354 = fmul float %351, %351 - %355 = fadd float %354, %353 - %356 = fmul float %352, %352 - %357 = fadd float %355, %356 - %358 = call float @llvm.AMDGPU.rsq.f32(float %357) - %359 = fmul float %350, %358 - %360 = fmul float %351, %358 - %361 = fmul float %352, %358 - %362 = bitcast float %136 to i32 - %363 = bitcast float %182 to i32 - %364 = bitcast float %137 to i32 - %365 = bitcast float %183 to i32 - %366 = bitcast float %temp28.1 to i32 - %367 = bitcast float %temp29.1 to i32 - %368 = insertelement <8 x i32> undef, i32 %362, i32 0 - %369 = insertelement <8 x i32> %368, i32 %363, i32 1 - %370 = insertelement <8 x i32> %369, i32 %364, i32 2 - %371 = insertelement <8 x i32> %370, i32 %365, i32 3 - %372 = insertelement <8 x i32> %371, i32 %366, i32 4 - %373 = insertelement <8 x i32> %372, i32 %367, i32 5 - %374 = insertelement <8 x i32> %373, i32 undef, i32 6 - %375 = insertelement <8 x i32> %374, i32 undef, i32 7 - %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2) - %377 = extractelement <4 x float> %376, i32 0 - %378 = extractelement <4 x float> %376, i32 1 - %379 = extractelement <4 x float> %376, i32 2 - %380 = extractelement <4 x float> %376, i32 3 - %381 = fsub float -0.000000e+00, %95 - %382 = fsub float -0.000000e+00, %96 - %383 = fsub float -0.000000e+00, %97 - %384 = fmul float %359, %381 - %385 = fmul float %360, %382 - %386 = fadd float %385, %384 - %387 = fmul float %361, %383 - %388 = fadd float %386, %387 - %389 = fmul float %388, %359 - %390 = fmul float %388, %360 - %391 = fmul float %388, %361 - %392 = fmul float 2.000000e+00, %389 - %393 = fmul float 2.000000e+00, %390 - %394 = fmul float 2.000000e+00, %391 - %395 = fsub float -0.000000e+00, %392 - %396 = fadd float %381, %395 - %397 = fsub float -0.000000e+00, %393 - %398 = fadd float %382, %397 - %399 = fsub float -0.000000e+00, %394 - %400 = fadd float %383, %399 - %401 = fmul float %396, %98 - %402 = fmul float %396, %99 - %403 = fmul float %396, %100 - %404 = fmul float %398, %101 - %405 = fadd float %404, %401 - %406 = fmul float %398, %102 - %407 = fadd float %406, %402 - %408 = fmul float %398, %103 - %409 = fadd float %408, %403 - %410 = fmul float %400, %104 - %411 = fadd float %410, %405 - %412 = fmul float %400, %105 - %413 = fadd float %412, %407 - %414 = fmul float %400, %106 - %415 = fadd float %414, %409 - %416 = bitcast float %136 to i32 - %417 = bitcast float %182 to i32 - %418 = bitcast float %137 to i32 - %419 = bitcast float %183 to i32 - %420 = bitcast float %temp28.1 to i32 - %421 = bitcast float %temp29.1 to 
i32 - %422 = insertelement <8 x i32> undef, i32 %416, i32 0 - %423 = insertelement <8 x i32> %422, i32 %417, i32 1 - %424 = insertelement <8 x i32> %423, i32 %418, i32 2 - %425 = insertelement <8 x i32> %424, i32 %419, i32 3 - %426 = insertelement <8 x i32> %425, i32 %420, i32 4 - %427 = insertelement <8 x i32> %426, i32 %421, i32 5 - %428 = insertelement <8 x i32> %427, i32 undef, i32 6 - %429 = insertelement <8 x i32> %428, i32 undef, i32 7 - %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2) - %431 = extractelement <4 x float> %430, i32 0 - %432 = extractelement <4 x float> %430, i32 1 - %433 = extractelement <4 x float> %430, i32 2 - %434 = fmul float %48, %411 - %435 = fmul float %49, %411 - %436 = fmul float %50, %411 - %437 = fmul float %51, %413 - %438 = fadd float %437, %434 - %439 = fmul float %52, %413 - %440 = fadd float %439, %435 - %441 = fmul float %53, %413 - %442 = fadd float %441, %436 - %443 = fmul float %54, %415 - %444 = fadd float %443, %438 - %445 = fmul float %55, %415 - %446 = fadd float %445, %440 - %447 = fmul float %56, %415 - %448 = fadd float %447, %442 - %449 = insertelement <4 x float> undef, float %444, i32 0 - %450 = insertelement <4 x float> %449, float %446, i32 1 - %451 = insertelement <4 x float> %450, float %448, i32 2 - %452 = insertelement <4 x float> %451, float %195, i32 3 - %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452) - %454 = extractelement <4 x float> %453, i32 0 - %455 = extractelement <4 x float> %453, i32 1 - %456 = extractelement <4 x float> %453, i32 2 - %457 = extractelement <4 x float> %453, i32 3 - %458 = call float @fabs(float %456) - %459 = fdiv float 1.000000e+00, %458 - %460 = fmul float %454, %459 - %461 = fadd float %460, 1.500000e+00 - %462 = fmul float %455, %459 - %463 = fadd float %462, 1.500000e+00 - %464 = bitcast float %463 to i32 - %465 = bitcast float %461 to i32 - %466 = bitcast float %457 to i32 - %467 = insertelement <4 x i32> undef, i32 %464, i32 0 - %468 = insertelement <4 x i32> %467, i32 %465, i32 1 - %469 = insertelement <4 x i32> %468, i32 %466, i32 2 - %470 = insertelement <4 x i32> %469, i32 undef, i32 3 - %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4) - %472 = extractelement <4 x float> %471, i32 0 - %473 = extractelement <4 x float> %471, i32 1 - %474 = extractelement <4 x float> %471, i32 2 - %475 = fmul float %431, %472 - %476 = fadd float %475, %329 - %477 = fmul float %432, %473 - %478 = fadd float %477, %330 - %479 = fmul float %433, %474 - %480 = fadd float %479, %331 - %481 = fmul float %107, %107 - %482 = fmul float %108, %108 - %483 = fadd float %482, %481 - %484 = fmul float %109, %109 - %485 = fadd float %483, %484 - %486 = call float @llvm.AMDGPU.rsq.f32(float %485) - %487 = fmul float %107, %486 - %488 = fmul float %108, %486 - %489 = fmul float %109, %486 - %490 = fmul float %377, %40 - %491 = fmul float %378, %41 - %492 = fmul float %379, %42 - %493 = fmul float %359, %487 - %494 = fmul float %360, %488 - %495 = fadd float %494, %493 - %496 = fmul float %361, %489 - %497 = fadd float %495, %496 - %498 = fmul float %497, %359 - %499 = fmul float %497, %360 - %500 = fmul float %497, %361 - %501 = fmul float 2.000000e+00, %498 - %502 = fmul float 2.000000e+00, %499 - %503 = fmul float 2.000000e+00, %500 - %504 = fsub float -0.000000e+00, %501 - %505 = fadd float %487, %504 - %506 = fsub float -0.000000e+00, %502 - %507 = fadd float %488, %506 - %508 = fsub float -0.000000e+00, %503 - 
%509 = fadd float %489, %508 - %510 = fmul float %95, %95 - %511 = fmul float %96, %96 - %512 = fadd float %511, %510 - %513 = fmul float %97, %97 - %514 = fadd float %512, %513 - %515 = call float @llvm.AMDGPU.rsq.f32(float %514) - %516 = fmul float %95, %515 - %517 = fmul float %96, %515 - %518 = fmul float %97, %515 - %519 = fmul float %505, %516 - %520 = fmul float %507, %517 - %521 = fadd float %520, %519 - %522 = fmul float %509, %518 - %523 = fadd float %521, %522 - %524 = fsub float -0.000000e+00, %523 - %525 = fcmp uge float %524, 0.000000e+00 - %526 = select i1 %525, float %524, float 0.000000e+00 - %527 = fmul float %43, %380 - %528 = fadd float %527, 1.000000e+00 - %529 = call float @llvm.pow.f32(float %526, float %528) - %530 = fmul float %476, %37 - %531 = fmul float %478, %38 - %532 = fmul float %480, %39 - %533 = fmul float %359, %487 - %534 = fmul float %360, %488 - %535 = fadd float %534, %533 - %536 = fmul float %361, %489 - %537 = fadd float %535, %536 - %538 = fcmp uge float %537, 0.000000e+00 - %539 = select i1 %538, float %537, float 0.000000e+00 - %540 = fmul float %530, %539 - %541 = fmul float %531, %539 - %542 = fmul float %532, %539 - %543 = fmul float %490, %529 - %544 = fadd float %543, %540 - %545 = fmul float %491, %529 - %546 = fadd float %545, %541 - %547 = fmul float %492, %529 - %548 = fadd float %547, %542 - %549 = fmul float %476, %34 - %550 = fmul float %478, %35 - %551 = fmul float %480, %36 - %552 = fmul float %544, %57 - %553 = fadd float %552, %549 - %554 = fmul float %546, %58 - %555 = fadd float %554, %550 - %556 = fmul float %548, %59 - %557 = fadd float %556, %551 - %558 = bitcast float %136 to i32 - %559 = bitcast float %182 to i32 - %560 = bitcast float %137 to i32 - %561 = bitcast float %183 to i32 - %562 = bitcast float %temp28.1 to i32 - %563 = bitcast float %temp29.1 to i32 - %564 = insertelement <8 x i32> undef, i32 %558, i32 0 - %565 = insertelement <8 x i32> %564, i32 %559, i32 1 - %566 = insertelement <8 x i32> %565, i32 %560, i32 2 - %567 = insertelement <8 x i32> %566, i32 %561, i32 3 - %568 = insertelement <8 x i32> %567, i32 %562, i32 4 - %569 = insertelement <8 x i32> %568, i32 %563, i32 5 - %570 = insertelement <8 x i32> %569, i32 undef, i32 6 - %571 = insertelement <8 x i32> %570, i32 undef, i32 7 - %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2) - %573 = extractelement <4 x float> %572, i32 0 - %574 = extractelement <4 x float> %572, i32 1 - %575 = extractelement <4 x float> %572, i32 2 - %576 = fmul float %573, %44 - %577 = fadd float %576, %553 - %578 = fmul float %574, %45 - %579 = fadd float %578, %555 - %580 = fmul float %575, %46 - %581 = fadd float %580, %557 - %582 = call i32 @llvm.SI.packf16(float %577, float %579) - %583 = bitcast i32 %582 to float - %584 = call i32 @llvm.SI.packf16(float %581, float %283) - %585 = bitcast i32 %584 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585) - ret void - -ENDIF66: ; preds = %LOOP65 - %586 = bitcast float %temp28.1 to i32 - %587 = bitcast float %temp29.1 to i32 - %588 = insertelement <8 x i32> %237, i32 %586, i32 4 - %589 = insertelement <8 x i32> %588, i32 %587, i32 5 - %590 = insertelement <8 x i32> %589, i32 undef, i32 6 - %591 = insertelement <8 x i32> %590, i32 undef, i32 7 - %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2) - %593 = extractelement <4 x float> %592, i32 3 - %594 = fcmp oge float 
%temp30.1, %593 - %595 = sext i1 %594 to i32 - %596 = bitcast i32 %595 to float - %597 = bitcast float %596 to i32 - %598 = and i32 %597, 1065353216 - %599 = bitcast i32 %598 to float - %600 = fmul float 5.000000e-01, %temp32.0 - %601 = fsub float -0.000000e+00, %600 - %602 = fmul float %599, %temp32.0 - %603 = fadd float %602, %601 - %604 = fmul float %214, %603 - %605 = fadd float %604, %temp28.1 - %606 = fmul float %215, %603 - %607 = fadd float %606, %temp29.1 - %608 = fmul float %216, %603 - %609 = fadd float %608, %temp30.1 - %610 = fadd float %temp24.1, 1.000000e+00 - %611 = fmul float %temp32.0, 5.000000e-01 - br label %LOOP65 -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 - -; Function Attrs: readnone -declare i32 @llvm.SI.tid() #2 - -; Function Attrs: readonly -declare float @ceil(float) #3 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.f32(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2 - -; Function Attrs: readnone -declare float @fabs(float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #4 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } -attributes #3 = { readonly } -attributes #4 = { nounwind readonly } - -!0 = !{!"const", null, i32 1} - -; CHECK-LABEL: {{^}}main1: -; CHECK: s_endpgm -define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %21 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0 - %22 = load <16 x i8>, <16 x i8> addrspace(2)* %21, !tbaa !0 - %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 0) - %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 4) - %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 8) - %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 12) - %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 28) - %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 48) - %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 52) - %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 56) - %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 64) - %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 68) - %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 72) - %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 76) - %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128) - %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132) - %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144) - %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 148) - %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 152) - %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160) - %41 = call float 
@llvm.SI.load.const(<16 x i8> %22, i32 164) - %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 168) - %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 172) - %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176) - %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180) - %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184) - %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192) - %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196) - %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200) - %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208) - %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212) - %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216) - %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 220) - %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 236) - %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240) - %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244) - %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248) - %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 252) - %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256) - %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 260) - %61 = call float @llvm.SI.load.const(<16 x i8> %22, i32 264) - %62 = call float @llvm.SI.load.const(<16 x i8> %22, i32 268) - %63 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272) - %64 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276) - %65 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280) - %66 = call float @llvm.SI.load.const(<16 x i8> %22, i32 284) - %67 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288) - %68 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292) - %69 = call float @llvm.SI.load.const(<16 x i8> %22, i32 464) - %70 = call float @llvm.SI.load.const(<16 x i8> %22, i32 468) - %71 = call float @llvm.SI.load.const(<16 x i8> %22, i32 472) - %72 = call float @llvm.SI.load.const(<16 x i8> %22, i32 496) - %73 = call float @llvm.SI.load.const(<16 x i8> %22, i32 500) - %74 = call float @llvm.SI.load.const(<16 x i8> %22, i32 504) - %75 = call float @llvm.SI.load.const(<16 x i8> %22, i32 512) - %76 = call float @llvm.SI.load.const(<16 x i8> %22, i32 516) - %77 = call float @llvm.SI.load.const(<16 x i8> %22, i32 524) - %78 = call float @llvm.SI.load.const(<16 x i8> %22, i32 532) - %79 = call float @llvm.SI.load.const(<16 x i8> %22, i32 536) - %80 = call float @llvm.SI.load.const(<16 x i8> %22, i32 540) - %81 = call float @llvm.SI.load.const(<16 x i8> %22, i32 544) - %82 = call float @llvm.SI.load.const(<16 x i8> %22, i32 548) - %83 = call float @llvm.SI.load.const(<16 x i8> %22, i32 552) - %84 = call float @llvm.SI.load.const(<16 x i8> %22, i32 556) - %85 = call float @llvm.SI.load.const(<16 x i8> %22, i32 560) - %86 = call float @llvm.SI.load.const(<16 x i8> %22, i32 564) - %87 = call float @llvm.SI.load.const(<16 x i8> %22, i32 568) - %88 = call float @llvm.SI.load.const(<16 x i8> %22, i32 572) - %89 = call float @llvm.SI.load.const(<16 x i8> %22, i32 576) - %90 = call float @llvm.SI.load.const(<16 x i8> %22, i32 580) - %91 = call float @llvm.SI.load.const(<16 x i8> %22, i32 584) - %92 = call float @llvm.SI.load.const(<16 x i8> %22, i32 588) - %93 = call float @llvm.SI.load.const(<16 x i8> %22, i32 592) - %94 = call float @llvm.SI.load.const(<16 x i8> %22, i32 596) - %95 = call float @llvm.SI.load.const(<16 x i8> %22, i32 600) - %96 = call float @llvm.SI.load.const(<16 x i8> %22, i32 604) - %97 = call float @llvm.SI.load.const(<16 x 
i8> %22, i32 608) - %98 = call float @llvm.SI.load.const(<16 x i8> %22, i32 612) - %99 = call float @llvm.SI.load.const(<16 x i8> %22, i32 616) - %100 = call float @llvm.SI.load.const(<16 x i8> %22, i32 624) - %101 = call float @llvm.SI.load.const(<16 x i8> %22, i32 628) - %102 = call float @llvm.SI.load.const(<16 x i8> %22, i32 632) - %103 = call float @llvm.SI.load.const(<16 x i8> %22, i32 636) - %104 = call float @llvm.SI.load.const(<16 x i8> %22, i32 640) - %105 = call float @llvm.SI.load.const(<16 x i8> %22, i32 644) - %106 = call float @llvm.SI.load.const(<16 x i8> %22, i32 648) - %107 = call float @llvm.SI.load.const(<16 x i8> %22, i32 652) - %108 = call float @llvm.SI.load.const(<16 x i8> %22, i32 656) - %109 = call float @llvm.SI.load.const(<16 x i8> %22, i32 660) - %110 = call float @llvm.SI.load.const(<16 x i8> %22, i32 664) - %111 = call float @llvm.SI.load.const(<16 x i8> %22, i32 668) - %112 = call float @llvm.SI.load.const(<16 x i8> %22, i32 672) - %113 = call float @llvm.SI.load.const(<16 x i8> %22, i32 676) - %114 = call float @llvm.SI.load.const(<16 x i8> %22, i32 680) - %115 = call float @llvm.SI.load.const(<16 x i8> %22, i32 684) - %116 = call float @llvm.SI.load.const(<16 x i8> %22, i32 688) - %117 = call float @llvm.SI.load.const(<16 x i8> %22, i32 692) - %118 = call float @llvm.SI.load.const(<16 x i8> %22, i32 696) - %119 = call float @llvm.SI.load.const(<16 x i8> %22, i32 700) - %120 = call float @llvm.SI.load.const(<16 x i8> %22, i32 704) - %121 = call float @llvm.SI.load.const(<16 x i8> %22, i32 708) - %122 = call float @llvm.SI.load.const(<16 x i8> %22, i32 712) - %123 = call float @llvm.SI.load.const(<16 x i8> %22, i32 716) - %124 = call float @llvm.SI.load.const(<16 x i8> %22, i32 864) - %125 = call float @llvm.SI.load.const(<16 x i8> %22, i32 868) - %126 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0 - %127 = load <32 x i8>, <32 x i8> addrspace(2)* %126, !tbaa !0 - %128 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0 - %129 = load <16 x i8>, <16 x i8> addrspace(2)* %128, !tbaa !0 - %130 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1 - %131 = load <32 x i8>, <32 x i8> addrspace(2)* %130, !tbaa !0 - %132 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1 - %133 = load <16 x i8>, <16 x i8> addrspace(2)* %132, !tbaa !0 - %134 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2 - %135 = load <32 x i8>, <32 x i8> addrspace(2)* %134, !tbaa !0 - %136 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2 - %137 = load <16 x i8>, <16 x i8> addrspace(2)* %136, !tbaa !0 - %138 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3 - %139 = load <32 x i8>, <32 x i8> addrspace(2)* %138, !tbaa !0 - %140 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3 - %141 = load <16 x i8>, <16 x i8> addrspace(2)* %140, !tbaa !0 - %142 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4 - %143 = load <32 x i8>, <32 x i8> addrspace(2)* %142, !tbaa !0 - %144 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4 - %145 = load <16 x i8>, <16 x i8> addrspace(2)* %144, !tbaa !0 - %146 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5 - %147 = load <32 x i8>, <32 x i8> addrspace(2)* %146, !tbaa !0 - %148 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] 
addrspace(2)* %1, i64 0, i32 5 - %149 = load <16 x i8>, <16 x i8> addrspace(2)* %148, !tbaa !0 - %150 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6 - %151 = load <32 x i8>, <32 x i8> addrspace(2)* %150, !tbaa !0 - %152 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6 - %153 = load <16 x i8>, <16 x i8> addrspace(2)* %152, !tbaa !0 - %154 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7 - %155 = load <32 x i8>, <32 x i8> addrspace(2)* %154, !tbaa !0 - %156 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7 - %157 = load <16 x i8>, <16 x i8> addrspace(2)* %156, !tbaa !0 - %158 = getelementptr [16 x <32 x i8>], [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 8 - %159 = load <32 x i8>, <32 x i8> addrspace(2)* %158, !tbaa !0 - %160 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 8 - %161 = load <16 x i8>, <16 x i8> addrspace(2)* %160, !tbaa !0 - %162 = fcmp ugt float %17, 0.000000e+00 - %163 = select i1 %162, float 1.000000e+00, float 0.000000e+00 - %164 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6) - %165 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6) - %166 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %4, <2 x i32> %6) - %167 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %4, <2 x i32> %6) - %168 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6) - %169 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6) - %170 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6) - %171 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %4, <2 x i32> %6) - %172 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6) - %173 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6) - %174 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6) - %175 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %4, <2 x i32> %6) - %176 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6) - %177 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6) - %178 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6) - %179 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %4, <2 x i32> %6) - %180 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6) - %181 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6) - %182 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6) - %183 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %4, <2 x i32> %6) - %184 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6) - %185 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6) - %186 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6) - %187 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %4, <2 x i32> %6) - %188 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %4, <2 x i32> %6) - %189 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %4, <2 x i32> %6) - %190 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %4, <2 x i32> %6) - %191 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %4, <2 x i32> %6) - %192 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %4, <2 x i32> %6) - %193 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %4, <2 x i32> %6) - %194 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %4, <2 x i32> %6) - %195 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %4, <2 x i32> %6) - %196 = fmul float %14, %124 - 
%197 = fadd float %196, %125 - %198 = call float @llvm.AMDIL.clamp.(float %163, float 0.000000e+00, float 1.000000e+00) - %199 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - %200 = call float @llvm.AMDIL.clamp.(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00) - %201 = call float @llvm.AMDIL.clamp.(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00) - %202 = bitcast float %198 to i32 - %203 = icmp ne i32 %202, 0 - %. = select i1 %203, float -1.000000e+00, float 1.000000e+00 - %204 = fsub float -0.000000e+00, %164 - %205 = fadd float %44, %204 - %206 = fsub float -0.000000e+00, %165 - %207 = fadd float %45, %206 - %208 = fsub float -0.000000e+00, %166 - %209 = fadd float %46, %208 - %210 = fmul float %205, %205 - %211 = fmul float %207, %207 - %212 = fadd float %211, %210 - %213 = fmul float %209, %209 - %214 = fadd float %212, %213 - %215 = call float @llvm.AMDGPU.rsq.f32(float %214) - %216 = fmul float %205, %215 - %217 = fmul float %207, %215 - %218 = fmul float %209, %215 - %219 = fmul float %., %54 - %220 = fmul float %13, %47 - %221 = fmul float %197, %48 - %222 = bitcast float %174 to i32 - %223 = bitcast float %175 to i32 - %224 = insertelement <2 x i32> undef, i32 %222, i32 0 - %225 = insertelement <2 x i32> %224, i32 %223, i32 1 - %226 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %225, <32 x i8> %131, <16 x i8> %133, i32 2) - %227 = extractelement <4 x float> %226, i32 0 - %228 = extractelement <4 x float> %226, i32 1 - %229 = extractelement <4 x float> %226, i32 2 - %230 = extractelement <4 x float> %226, i32 3 - %231 = fmul float %227, 0x4012611180000000 - %232 = fmul float %228, 0x4012611180000000 - %233 = fmul float %229, 0x4012611180000000 - %234 = call float @llvm.AMDGPU.lrp(float %27, float %231, float 1.000000e+00) - %235 = call float @llvm.AMDGPU.lrp(float %27, float %232, float 1.000000e+00) - %236 = call float @llvm.AMDGPU.lrp(float %27, float %233, float 1.000000e+00) - %237 = fmul float %216, %184 - %238 = fmul float %217, %185 - %239 = fadd float %238, %237 - %240 = fmul float %218, %186 - %241 = fadd float %239, %240 - %242 = fmul float %216, %187 - %243 = fmul float %217, %188 - %244 = fadd float %243, %242 - %245 = fmul float %218, %189 - %246 = fadd float %244, %245 - %247 = fmul float %216, %190 - %248 = fmul float %217, %191 - %249 = fadd float %248, %247 - %250 = fmul float %218, %192 - %251 = fadd float %249, %250 - %252 = call float @llvm.AMDIL.clamp.(float %251, float 0.000000e+00, float 1.000000e+00) - %253 = fmul float %214, 0x3F5A36E2E0000000 - %254 = call float @llvm.AMDIL.clamp.(float %253, float 0.000000e+00, float 1.000000e+00) - %255 = fsub float -0.000000e+00, %254 - %256 = fadd float 1.000000e+00, %255 - %257 = call float @llvm.pow.f32(float %252, float 2.500000e-01) - %258 = fmul float %39, %257 - %259 = fmul float %241, %258 - %260 = fmul float %246, %258 - %261 = fmul float %259, %230 - %262 = fmul float %260, %230 - %263 = fadd float %252, 0x3EE4F8B580000000 - %264 = fsub float -0.000000e+00, %252 - %265 = fadd float 1.000000e+00, %264 - %266 = fmul float 1.200000e+01, %265 - %267 = fadd float %266, 4.000000e+00 - %268 = fsub float -0.000000e+00, %267 - %269 = fmul float %268, %263 - %270 = fsub float -0.000000e+00, %267 - %271 = fmul float %270, %263 - %272 = fsub float -0.000000e+00, %267 - %273 = fmul float %272, %263 - %274 = fdiv float 1.000000e+00, %269 - %275 = fdiv float 1.000000e+00, %271 - %276 = fdiv float 1.000000e+00, %273 - %277 = fmul float %261, %274 - %278 
= fmul float %262, %275 - %279 = fmul float %263, %276 - br label %LOOP - -LOOP: ; preds = %LOOP, %main_body - %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %292, %LOOP ] - %temp168.0 = phi float [ %176, %main_body ], [ %288, %LOOP ] - %temp169.0 = phi float [ %177, %main_body ], [ %289, %LOOP ] - %temp170.0 = phi float [ %256, %main_body ], [ %290, %LOOP ] - %280 = bitcast float %temp168.0 to i32 - %281 = bitcast float %temp169.0 to i32 - %282 = insertelement <4 x i32> undef, i32 %280, i32 0 - %283 = insertelement <4 x i32> %282, i32 %281, i32 1 - %284 = insertelement <4 x i32> %283, i32 0, i32 2 - %285 = insertelement <4 x i32> %284, i32 undef, i32 3 - %286 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %285, <32 x i8> %147, <16 x i8> %149, i32 2) - %287 = extractelement <4 x float> %286, i32 3 - %288 = fadd float %temp168.0, %277 - %289 = fadd float %temp169.0, %278 - %290 = fadd float %temp170.0, %279 - %291 = fsub float -0.000000e+00, %287 - %292 = fadd float %290, %291 - %293 = fcmp oge float 0.000000e+00, %292 - %294 = sext i1 %293 to i32 - %295 = bitcast i32 %294 to float - %296 = bitcast float %295 to i32 - %297 = icmp ne i32 %296, 0 - br i1 %297, label %IF189, label %LOOP - -IF189: ; preds = %LOOP - %298 = extractelement <4 x float> %286, i32 0 - %299 = extractelement <4 x float> %286, i32 1 - %300 = extractelement <4 x float> %286, i32 2 - %301 = fsub float -0.000000e+00, %292 - %302 = fadd float %temp144.0, %301 - %303 = fdiv float 1.000000e+00, %302 - %304 = fmul float %292, %303 - %305 = fadd float %304, -1.000000e+00 - %306 = fmul float %305, %277 - %307 = fadd float %306, %288 - %308 = fmul float %305, %278 - %309 = fadd float %308, %289 - %310 = fsub float -0.000000e+00, %176 - %311 = fadd float %307, %310 - %312 = fsub float -0.000000e+00, %177 - %313 = fadd float %309, %312 - %314 = fadd float %176, %311 - %315 = fadd float %177, %313 - %316 = fmul float %311, %67 - %317 = fmul float %313, %68 - %318 = fmul float %316, %55 - %319 = fmul float %316, %56 - %320 = fmul float %317, %57 - %321 = fadd float %320, %318 - %322 = fmul float %317, %58 - %323 = fadd float %322, %319 - %324 = fadd float %178, %321 - %325 = fadd float %179, %323 - %326 = fmul float %316, %59 - %327 = fmul float %316, %60 - %328 = fmul float %316, %61 - %329 = fmul float %316, %62 - %330 = fmul float %317, %63 - %331 = fadd float %330, %326 - %332 = fmul float %317, %64 - %333 = fadd float %332, %327 - %334 = fmul float %317, %65 - %335 = fadd float %334, %328 - %336 = fmul float %317, %66 - %337 = fadd float %336, %329 - %338 = fadd float %168, %331 - %339 = fadd float %169, %333 - %340 = fadd float %170, %335 - %341 = fadd float %171, %337 - %342 = bitcast float %338 to i32 - %343 = bitcast float %339 to i32 - %344 = insertelement <2 x i32> undef, i32 %342, i32 0 - %345 = insertelement <2 x i32> %344, i32 %343, i32 1 - %346 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %345, <32 x i8> %135, <16 x i8> %137, i32 2) - %347 = extractelement <4 x float> %346, i32 0 - %348 = extractelement <4 x float> %346, i32 1 - %349 = extractelement <4 x float> %346, i32 2 - %350 = extractelement <4 x float> %346, i32 3 - %351 = fmul float %347, %23 - %352 = fmul float %348, %24 - %353 = fmul float %349, %25 - %354 = fmul float %350, %26 - %355 = fmul float %351, %180 - %356 = fmul float %352, %181 - %357 = fmul float %353, %182 - %358 = fmul float %354, %183 - %359 = fsub float -0.000000e+00, %350 - %360 = fadd float 1.000000e+00, %359 - %361 = fmul float %360, %49 - %362 = call float 
@llvm.AMDGPU.lrp(float %361, float %347, float %355) - %363 = call float @llvm.AMDGPU.lrp(float %361, float %348, float %356) - %364 = call float @llvm.AMDGPU.lrp(float %361, float %349, float %357) - %365 = bitcast float %340 to i32 - %366 = bitcast float %341 to i32 - %367 = insertelement <2 x i32> undef, i32 %365, i32 0 - %368 = insertelement <2 x i32> %367, i32 %366, i32 1 - %369 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %368, <32 x i8> %151, <16 x i8> %153, i32 2) - %370 = extractelement <4 x float> %369, i32 2 - %371 = fmul float %362, %234 - %372 = fmul float %363, %235 - %373 = fmul float %364, %236 - %374 = fmul float %358, %230 - %375 = bitcast float %314 to i32 - %376 = bitcast float %315 to i32 - %377 = insertelement <2 x i32> undef, i32 %375, i32 0 - %378 = insertelement <2 x i32> %377, i32 %376, i32 1 - %379 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %378, <32 x i8> %139, <16 x i8> %141, i32 2) - %380 = extractelement <4 x float> %379, i32 0 - %381 = extractelement <4 x float> %379, i32 1 - %382 = extractelement <4 x float> %379, i32 2 - %383 = extractelement <4 x float> %379, i32 3 - %384 = fcmp olt float 0.000000e+00, %382 - %385 = sext i1 %384 to i32 - %386 = bitcast i32 %385 to float - %387 = bitcast float %386 to i32 - %388 = icmp ne i32 %387, 0 - %.224 = select i1 %388, float %381, float %380 - %.225 = select i1 %388, float %383, float %381 - %389 = bitcast float %324 to i32 - %390 = bitcast float %325 to i32 - %391 = insertelement <2 x i32> undef, i32 %389, i32 0 - %392 = insertelement <2 x i32> %391, i32 %390, i32 1 - %393 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %392, <32 x i8> %143, <16 x i8> %145, i32 2) - %394 = extractelement <4 x float> %393, i32 0 - %395 = extractelement <4 x float> %393, i32 1 - %396 = extractelement <4 x float> %393, i32 2 - %397 = extractelement <4 x float> %393, i32 3 - %398 = fcmp olt float 0.000000e+00, %396 - %399 = sext i1 %398 to i32 - %400 = bitcast i32 %399 to float - %401 = bitcast float %400 to i32 - %402 = icmp ne i32 %401, 0 - %temp112.1 = select i1 %402, float %395, float %394 - %temp113.1 = select i1 %402, float %397, float %395 - %403 = fmul float %.224, 2.000000e+00 - %404 = fadd float %403, -1.000000e+00 - %405 = fmul float %.225, 2.000000e+00 - %406 = fadd float %405, -1.000000e+00 - %407 = fmul float %temp112.1, 2.000000e+00 - %408 = fadd float %407, -1.000000e+00 - %409 = fmul float %temp113.1, 2.000000e+00 - %410 = fadd float %409, -1.000000e+00 - %411 = fsub float -0.000000e+00, %404 - %412 = fmul float %411, %35 - %413 = fsub float -0.000000e+00, %406 - %414 = fmul float %413, %35 - %415 = fsub float -0.000000e+00, %408 - %416 = fmul float %415, %36 - %417 = fsub float -0.000000e+00, %410 - %418 = fmul float %417, %36 - %419 = fmul float %416, %370 - %420 = fmul float %418, %370 - %421 = call float @fabs(float %412) - %422 = call float @fabs(float %414) - %423 = fsub float -0.000000e+00, %421 - %424 = fadd float 1.000000e+00, %423 - %425 = fsub float -0.000000e+00, %422 - %426 = fadd float 1.000000e+00, %425 - %427 = fmul float %424, %419 - %428 = fadd float %427, %412 - %429 = fmul float %426, %420 - %430 = fadd float %429, %414 - %431 = fmul float %428, %428 - %432 = fmul float %430, %430 - %433 = fadd float %431, %432 - %434 = fsub float -0.000000e+00, %433 - %435 = fadd float 0x3FF00068E0000000, %434 - %436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00) - %437 = call float @llvm.AMDGPU.rsq.f32(float %436) - %438 = fmul float %437, %436 - %439 = fsub 
float -0.000000e+00, %436 - %440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00) - %441 = fmul float %184, %428 - %442 = fmul float %185, %428 - %443 = fmul float %186, %428 - %444 = fmul float %187, %430 - %445 = fadd float %444, %441 - %446 = fmul float %188, %430 - %447 = fadd float %446, %442 - %448 = fmul float %189, %430 - %449 = fadd float %448, %443 - %450 = fmul float %190, %440 - %451 = fadd float %450, %445 - %452 = fmul float %191, %440 - %453 = fadd float %452, %447 - %454 = fmul float %192, %440 - %455 = fadd float %454, %449 - %456 = fmul float %451, %451 - %457 = fmul float %453, %453 - %458 = fadd float %457, %456 - %459 = fmul float %455, %455 - %460 = fadd float %458, %459 - %461 = call float @llvm.AMDGPU.rsq.f32(float %460) - %462 = fmul float %451, %461 - %463 = fmul float %453, %461 - %464 = fmul float %455, %461 - %465 = fcmp olt float 0.000000e+00, %219 - %466 = sext i1 %465 to i32 - %467 = bitcast i32 %466 to float - %468 = bitcast float %467 to i32 - %469 = icmp ne i32 %468, 0 - br i1 %469, label %IF198, label %ENDIF197 - -IF198: ; preds = %IF189 - %470 = fsub float -0.000000e+00, %462 - %471 = fsub float -0.000000e+00, %463 - %472 = fsub float -0.000000e+00, %464 - br label %ENDIF197 - -ENDIF197: ; preds = %IF189, %IF198 - %temp14.0 = phi float [ %472, %IF198 ], [ %464, %IF189 ] - %temp13.0 = phi float [ %471, %IF198 ], [ %463, %IF189 ] - %temp12.0 = phi float [ %470, %IF198 ], [ %462, %IF189 ] - %473 = bitcast float %220 to i32 - %474 = bitcast float %221 to i32 - %475 = insertelement <2 x i32> undef, i32 %473, i32 0 - %476 = insertelement <2 x i32> %475, i32 %474, i32 1 - %477 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %476, <32 x i8> %159, <16 x i8> %161, i32 2) - %478 = extractelement <4 x float> %477, i32 0 - %479 = extractelement <4 x float> %477, i32 1 - %480 = extractelement <4 x float> %477, i32 2 - %481 = extractelement <4 x float> %477, i32 3 - %482 = fmul float %478, %40 - %483 = fadd float %482, %41 - %484 = fmul float %479, %40 - %485 = fadd float %484, %41 - %486 = fmul float %480, %40 - %487 = fadd float %486, %41 - %488 = fmul float %481, %42 - %489 = fadd float %488, %43 - %490 = bitcast float %172 to i32 - %491 = bitcast float %173 to i32 - %492 = insertelement <2 x i32> undef, i32 %490, i32 0 - %493 = insertelement <2 x i32> %492, i32 %491, i32 1 - %494 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %493, <32 x i8> %155, <16 x i8> %157, i32 2) - %495 = extractelement <4 x float> %494, i32 0 - %496 = extractelement <4 x float> %494, i32 1 - %497 = extractelement <4 x float> %494, i32 2 - %498 = extractelement <4 x float> %494, i32 3 - %499 = fmul float %498, 3.200000e+01 - %500 = fadd float %499, -1.600000e+01 - %501 = call float @llvm.AMDIL.exp.(float %500) - %502 = fmul float %495, %501 - %503 = fmul float %496, %501 - %504 = fmul float %497, %501 - %505 = fmul float %28, %502 - %506 = fadd float %505, %193 - %507 = fmul float %29, %503 - %508 = fadd float %507, %194 - %509 = fmul float %30, %504 - %510 = fadd float %509, %195 - %511 = fmul float %506, %489 - %512 = fmul float %508, %489 - %513 = fmul float %510, %489 - %514 = fmul float %489, 5.000000e-01 - %515 = fadd float %514, 5.000000e-01 - %516 = fmul float %483, %515 - %517 = fadd float %516, %511 - %518 = fmul float %485, %515 - %519 = fadd float %518, %512 - %520 = fmul float %487, %515 - %521 = fadd float %520, %513 - %522 = fmul float %517, %371 - %523 = fmul float %519, %372 - %524 = fmul float %521, %373 - %525 = fmul float %428, 
0x3FDB272440000000 - %526 = fmul float %430, 0xBFDB272440000000 - %527 = fadd float %526, %525 - %528 = fmul float %440, 0x3FE99999A0000000 - %529 = fadd float %527, %528 - %530 = fmul float %529, 5.000000e-01 - %531 = fadd float %530, 0x3FE3333340000000 - %532 = fmul float %531, %531 - %533 = fmul float %522, %532 - %534 = fmul float %523, %532 - %535 = fmul float %524, %532 - %536 = fsub float -0.000000e+00, %72 - %537 = fsub float -0.000000e+00, %73 - %538 = fsub float -0.000000e+00, %74 - %539 = fmul float %temp12.0, %536 - %540 = fmul float %temp13.0, %537 - %541 = fadd float %540, %539 - %542 = fmul float %temp14.0, %538 - %543 = fadd float %541, %542 - %544 = call float @llvm.AMDIL.clamp.(float %543, float 0.000000e+00, float 1.000000e+00) - %545 = fmul float %371, %544 - %546 = fmul float %372, %544 - %547 = fmul float %373, %544 - %548 = fmul float %545, %69 - %549 = fmul float %546, %70 - %550 = fmul float %547, %71 - %551 = fsub float -0.000000e+00, %164 - %552 = fadd float %97, %551 - %553 = fsub float -0.000000e+00, %165 - %554 = fadd float %98, %553 - %555 = fsub float -0.000000e+00, %166 - %556 = fadd float %99, %555 - %557 = fmul float %552, %552 - %558 = fmul float %554, %554 - %559 = fadd float %558, %557 - %560 = fmul float %556, %556 - %561 = fadd float %559, %560 - %562 = call float @llvm.AMDGPU.rsq.f32(float %561) - %563 = fmul float %562, %561 - %564 = fsub float -0.000000e+00, %561 - %565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00) - %566 = fsub float -0.000000e+00, %84 - %567 = fadd float %565, %566 - %568 = fsub float -0.000000e+00, %83 - %569 = fadd float %565, %568 - %570 = fsub float -0.000000e+00, %82 - %571 = fadd float %565, %570 - %572 = fsub float -0.000000e+00, %84 - %573 = fadd float %83, %572 - %574 = fsub float -0.000000e+00, %83 - %575 = fadd float %82, %574 - %576 = fsub float -0.000000e+00, %82 - %577 = fadd float %81, %576 - %578 = fdiv float 1.000000e+00, %573 - %579 = fdiv float 1.000000e+00, %575 - %580 = fdiv float 1.000000e+00, %577 - %581 = fmul float %567, %578 - %582 = fmul float %569, %579 - %583 = fmul float %571, %580 - %584 = fcmp olt float %565, %83 - %585 = sext i1 %584 to i32 - %586 = bitcast i32 %585 to float - %587 = bitcast float %586 to i32 - %588 = icmp ne i32 %587, 0 - br i1 %588, label %ENDIF200, label %ELSE202 - -ELSE202: ; preds = %ENDIF197 - %589 = fcmp olt float %565, %82 - %590 = sext i1 %589 to i32 - %591 = bitcast i32 %590 to float - %592 = bitcast float %591 to i32 - %593 = icmp ne i32 %592, 0 - br i1 %593, label %ENDIF200, label %ELSE205 - -ENDIF200: ; preds = %ELSE205, %ELSE202, %ENDIF197 - %temp80.0 = phi float [ %581, %ENDIF197 ], [ %.226, %ELSE205 ], [ %582, %ELSE202 ] - %temp88.0 = phi float [ %122, %ENDIF197 ], [ %.227, %ELSE205 ], [ %120, %ELSE202 ] - %temp89.0 = phi float [ %123, %ENDIF197 ], [ %.228, %ELSE205 ], [ %121, %ELSE202 ] - %temp90.0 = phi float [ %120, %ENDIF197 ], [ %116, %ELSE205 ], [ %118, %ELSE202 ] - %temp91.0 = phi float [ %121, %ENDIF197 ], [ %117, %ELSE205 ], [ %119, %ELSE202 ] - %594 = fcmp olt float %565, %83 - %595 = sext i1 %594 to i32 - %596 = bitcast i32 %595 to float - %597 = bitcast float %596 to i32 - %598 = icmp ne i32 %597, 0 - br i1 %598, label %ENDIF209, label %ELSE211 - -ELSE205: ; preds = %ELSE202 - %599 = fcmp olt float %565, %81 - %600 = sext i1 %599 to i32 - %601 = bitcast i32 %600 to float - %602 = bitcast float %601 to i32 - %603 = icmp ne i32 %602, 0 - %.226 = select i1 %603, float %583, float 1.000000e+00 - %.227 = select i1 %603, float 
%118, float %116 - %.228 = select i1 %603, float %119, float %117 - br label %ENDIF200 - -ELSE211: ; preds = %ENDIF200 - %604 = fcmp olt float %565, %82 - %605 = sext i1 %604 to i32 - %606 = bitcast i32 %605 to float - %607 = bitcast float %606 to i32 - %608 = icmp ne i32 %607, 0 - br i1 %608, label %ENDIF209, label %ELSE214 - -ENDIF209: ; preds = %ELSE214, %ELSE211, %ENDIF200 - %temp52.0 = phi float [ %108, %ENDIF200 ], [ %100, %ELSE214 ], [ %104, %ELSE211 ] - %temp53.0 = phi float [ %109, %ENDIF200 ], [ %101, %ELSE214 ], [ %105, %ELSE211 ] - %temp54.0 = phi float [ %110, %ENDIF200 ], [ %102, %ELSE214 ], [ %106, %ELSE211 ] - %temp55.0 = phi float [ %111, %ENDIF200 ], [ %103, %ELSE214 ], [ %107, %ELSE211 ] - %temp68.0 = phi float [ %112, %ENDIF200 ], [ %.230, %ELSE214 ], [ %108, %ELSE211 ] - %temp69.0 = phi float [ %113, %ENDIF200 ], [ %.231, %ELSE214 ], [ %109, %ELSE211 ] - %temp70.0 = phi float [ %114, %ENDIF200 ], [ %.232, %ELSE214 ], [ %110, %ELSE211 ] - %temp71.0 = phi float [ %115, %ENDIF200 ], [ %.233, %ELSE214 ], [ %111, %ELSE211 ] - %609 = fmul float %164, %85 - %610 = fmul float %165, %86 - %611 = fadd float %609, %610 - %612 = fmul float %166, %87 - %613 = fadd float %611, %612 - %614 = fmul float %167, %88 - %615 = fadd float %613, %614 - %616 = fmul float %164, %89 - %617 = fmul float %165, %90 - %618 = fadd float %616, %617 - %619 = fmul float %166, %91 - %620 = fadd float %618, %619 - %621 = fmul float %167, %92 - %622 = fadd float %620, %621 - %623 = fmul float %164, %93 - %624 = fmul float %165, %94 - %625 = fadd float %623, %624 - %626 = fmul float %166, %95 - %627 = fadd float %625, %626 - %628 = fmul float %167, %96 - %629 = fadd float %627, %628 - %630 = fsub float -0.000000e+00, %78 - %631 = fadd float 1.000000e+00, %630 - %632 = call float @fabs(float %615) - %633 = call float @fabs(float %622) - %634 = fcmp oge float %631, %632 - %635 = sext i1 %634 to i32 - %636 = bitcast i32 %635 to float - %637 = bitcast float %636 to i32 - %638 = and i32 %637, 1065353216 - %639 = bitcast i32 %638 to float - %640 = fcmp oge float %631, %633 - %641 = sext i1 %640 to i32 - %642 = bitcast i32 %641 to float - %643 = bitcast float %642 to i32 - %644 = and i32 %643, 1065353216 - %645 = bitcast i32 %644 to float - %646 = fmul float %639, %645 - %647 = fmul float %629, %646 - %648 = fmul float %615, %temp68.0 - %649 = fadd float %648, %temp70.0 - %650 = fmul float %622, %temp69.0 - %651 = fadd float %650, %temp71.0 - %652 = fmul float %615, %temp52.0 - %653 = fadd float %652, %temp54.0 - %654 = fmul float %622, %temp53.0 - %655 = fadd float %654, %temp55.0 - %656 = fadd float %temp80.0, -1.000000e+00 - %657 = fmul float %656, %77 - %658 = fadd float %657, 1.000000e+00 - %659 = call float @llvm.AMDIL.clamp.(float %658, float 0.000000e+00, float 1.000000e+00) - %660 = bitcast float %649 to i32 - %661 = bitcast float %651 to i32 - %662 = bitcast float 0.000000e+00 to i32 - %663 = insertelement <4 x i32> undef, i32 %660, i32 0 - %664 = insertelement <4 x i32> %663, i32 %661, i32 1 - %665 = insertelement <4 x i32> %664, i32 %662, i32 2 - %666 = insertelement <4 x i32> %665, i32 undef, i32 3 - %667 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %666, <32 x i8> %127, <16 x i8> %129, i32 2) - %668 = extractelement <4 x float> %667, i32 0 - %669 = extractelement <4 x float> %667, i32 1 - %670 = bitcast float %653 to i32 - %671 = bitcast float %655 to i32 - %672 = bitcast float 0.000000e+00 to i32 - %673 = insertelement <4 x i32> undef, i32 %670, i32 0 - %674 = insertelement <4 x i32> %673, 
i32 %671, i32 1 - %675 = insertelement <4 x i32> %674, i32 %672, i32 2 - %676 = insertelement <4 x i32> %675, i32 undef, i32 3 - %677 = call <4 x float> @llvm.SI.samplel.v4i32(<4 x i32> %676, <32 x i8> %127, <16 x i8> %129, i32 2) - %678 = extractelement <4 x float> %677, i32 0 - %679 = extractelement <4 x float> %677, i32 1 - %680 = fsub float -0.000000e+00, %669 - %681 = fadd float 1.000000e+00, %680 - %682 = fsub float -0.000000e+00, %679 - %683 = fadd float 1.000000e+00, %682 - %684 = fmul float %681, 2.500000e-01 - %685 = fmul float %683, 2.500000e-01 - %686 = fsub float -0.000000e+00, %684 - %687 = fadd float %668, %686 - %688 = fsub float -0.000000e+00, %685 - %689 = fadd float %678, %688 - %690 = fmul float %647, %temp88.0 - %691 = fadd float %690, %temp89.0 - %692 = fmul float %647, %temp90.0 - %693 = fadd float %692, %temp91.0 - %694 = call float @llvm.AMDIL.clamp.(float %691, float 0.000000e+00, float 1.000000e+00) - %695 = call float @llvm.AMDIL.clamp.(float %693, float 0.000000e+00, float 1.000000e+00) - %696 = fsub float -0.000000e+00, %694 - %697 = fadd float %668, %696 - %698 = fsub float -0.000000e+00, %695 - %699 = fadd float %678, %698 - %700 = fmul float %668, %668 - %701 = fmul float %678, %678 - %702 = fsub float -0.000000e+00, %700 - %703 = fadd float %687, %702 - %704 = fsub float -0.000000e+00, %701 - %705 = fadd float %689, %704 - %706 = fcmp uge float %703, %75 - %707 = select i1 %706, float %703, float %75 - %708 = fcmp uge float %705, %75 - %709 = select i1 %708, float %705, float %75 - %710 = fmul float %697, %697 - %711 = fadd float %710, %707 - %712 = fmul float %699, %699 - %713 = fadd float %712, %709 - %714 = fdiv float 1.000000e+00, %711 - %715 = fdiv float 1.000000e+00, %713 - %716 = fmul float %707, %714 - %717 = fmul float %709, %715 - %718 = fcmp oge float %697, 0.000000e+00 - %719 = sext i1 %718 to i32 - %720 = bitcast i32 %719 to float - %721 = bitcast float %720 to i32 - %722 = icmp ne i32 %721, 0 - %.229 = select i1 %722, float 1.000000e+00, float %716 - %723 = fcmp oge float %699, 0.000000e+00 - %724 = sext i1 %723 to i32 - %725 = bitcast i32 %724 to float - %726 = bitcast float %725 to i32 - %727 = icmp ne i32 %726, 0 - %temp28.0 = select i1 %727, float 1.000000e+00, float %717 - %728 = call float @llvm.AMDGPU.lrp(float %659, float %temp28.0, float %.229) - %729 = call float @llvm.pow.f32(float %728, float %76) - %730 = fmul float %729, %79 - %731 = fadd float %730, %80 - %732 = call float @llvm.AMDIL.clamp.(float %731, float 0.000000e+00, float 1.000000e+00) - %733 = fmul float %732, %732 - %734 = fmul float 2.000000e+00, %732 - %735 = fsub float -0.000000e+00, %734 - %736 = fadd float 3.000000e+00, %735 - %737 = fmul float %733, %736 - %738 = fmul float %548, %737 - %739 = fmul float %549, %737 - %740 = fmul float %550, %737 - %741 = fmul float %738, %515 - %742 = fadd float %741, %533 - %743 = fmul float %739, %515 - %744 = fadd float %743, %534 - %745 = fmul float %740, %515 - %746 = fadd float %745, %535 - %747 = call float @llvm.AMDGPU.lrp(float %230, float %287, float 1.000000e+00) - %748 = call float @llvm.AMDGPU.lrp(float %37, float %298, float 1.000000e+00) - %749 = call float @llvm.AMDGPU.lrp(float %37, float %299, float 1.000000e+00) - %750 = call float @llvm.AMDGPU.lrp(float %37, float %300, float 1.000000e+00) - %751 = call float @llvm.AMDGPU.lrp(float %38, float %747, float 1.000000e+00) - %752 = fmul float %748, %751 - %753 = fmul float %749, %751 - %754 = fmul float %750, %751 - %755 = fmul float %742, %752 - %756 = fmul float 
%744, %753 - %757 = fmul float %746, %754 - %758 = fmul float %temp12.0, %216 - %759 = fmul float %temp13.0, %217 - %760 = fadd float %759, %758 - %761 = fmul float %temp14.0, %218 - %762 = fadd float %760, %761 - %763 = call float @fabs(float %762) - %764 = fmul float %763, %763 - %765 = fmul float %764, %50 - %766 = fadd float %765, %51 - %767 = call float @llvm.AMDIL.clamp.(float %766, float 0.000000e+00, float 1.000000e+00) - %768 = fsub float -0.000000e+00, %767 - %769 = fadd float 1.000000e+00, %768 - %770 = fmul float %33, %769 - %771 = fmul float %33, %769 - %772 = fmul float %33, %769 - %773 = fmul float %34, %769 - %774 = call float @llvm.AMDGPU.lrp(float %770, float %31, float %755) - %775 = call float @llvm.AMDGPU.lrp(float %771, float %31, float %756) - %776 = call float @llvm.AMDGPU.lrp(float %772, float %31, float %757) - %777 = call float @llvm.AMDGPU.lrp(float %773, float %32, float %374) - %778 = fcmp uge float %774, 0x3E6FFFFE60000000 - %779 = select i1 %778, float %774, float 0x3E6FFFFE60000000 - %780 = fcmp uge float %775, 0x3E6FFFFE60000000 - %781 = select i1 %780, float %775, float 0x3E6FFFFE60000000 - %782 = fcmp uge float %776, 0x3E6FFFFE60000000 - %783 = select i1 %782, float %776, float 0x3E6FFFFE60000000 - %784 = fcmp uge float %779, 6.550400e+04 - %785 = select i1 %784, float 6.550400e+04, float %779 - %786 = fcmp uge float %781, 6.550400e+04 - %787 = select i1 %786, float 6.550400e+04, float %781 - %788 = fcmp uge float %783, 6.550400e+04 - %789 = select i1 %788, float 6.550400e+04, float %783 - %790 = fmul float %777, %52 - %791 = fadd float %790, %53 - %792 = call float @llvm.AMDIL.clamp.(float %791, float 0.000000e+00, float 1.000000e+00) - %793 = call i32 @llvm.SI.packf16(float %785, float %787) - %794 = bitcast i32 %793 to float - %795 = call i32 @llvm.SI.packf16(float %789, float %792) - %796 = bitcast i32 %795 to float - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %794, float %796, float %794, float %796) - ret void - -ELSE214: ; preds = %ELSE211 - %797 = fcmp olt float %565, %81 - %798 = sext i1 %797 to i32 - %799 = bitcast i32 %798 to float - %800 = bitcast float %799 to i32 - %801 = icmp ne i32 %800, 0 - %.230 = select i1 %801, float %104, float %100 - %.231 = select i1 %801, float %105, float %101 - %.232 = select i1 %801, float %106, float %102 - %.233 = select i1 %801, float %107, float %103 - br label %ENDIF209 -} - -; Function Attrs: readnone -declare float @llvm.AMDIL.clamp.(float, float, float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #2 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.samplel.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.cndlt(float, float, float) #2 - -; Function Attrs: readnone -declare float @llvm.AMDIL.exp.(float) #2 - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } -attributes #3 = { nounwind readonly } -attributes #4 = { readonly } diff --git a/test/CodeGen/R600/si-spill-cf.ll b/test/CodeGen/R600/si-spill-cf.ll deleted file mode 100644 index 4b2d8ec6bf0..00000000000 --- a/test/CodeGen/R600/si-spill-cf.ll +++ /dev/null @@ -1,501 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | 
FileCheck -check-prefix=SI %s - -; If this occurs it is likely due to reordering and the restore was -; originally supposed to happen before SI_END_CF. -; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] -; SI-NOT: v_readlane_b32 [[SAVED]] - -define void @main() #0 { -main_body: - %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) - %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) - %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) - %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) - %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) - %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) - %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) - %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) - %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) - %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) - %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) - %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) - %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) - %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) - %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) - %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) - %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) - %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) - %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) - %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) - %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) - %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) - %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) - %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) - %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) - %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) - %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) - %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) - %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) - %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) - %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) - %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) - %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) - %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) - %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) - %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) - %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) - %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) - %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) - %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) - %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) - %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) - %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) - %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) - %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) - %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) - %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) - %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320) - %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) - %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) - %50 = call float 
@llvm.SI.load.const(<16 x i8> undef, i32 336) - %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) - %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) - %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) - %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) - %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) - %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) - %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) - %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) - %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) - %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) - %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) - %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) - %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) - %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) - %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) - %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) - br label %LOOP - -LOOP: ; preds = %ENDIF2795, %main_body - %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ] - %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ] - %67 = icmp sgt i32 undef, 4 - br i1 %67, label %ENDLOOP, label %ENDIF - -ENDLOOP: ; preds = %ELSE2566, %LOOP - %68 = call float @llvm.AMDGPU.lrp(float %0, float undef, float undef) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %68, float undef, float 1.000000e+00) - ret void - -ENDIF: ; preds = %LOOP - %69 = fsub float %2, undef - %70 = fsub float %3, undef - %71 = fsub float %4, undef - %72 = fmul float %69, 0.000000e+00 - %73 = fmul float %70, undef - %74 = fmul float %71, undef - %75 = fsub float %6, undef - %76 = fsub float %7, undef - %77 = fmul float %75, undef - %78 = fmul float %76, 0.000000e+00 - %79 = call float @llvm.minnum.f32(float %74, float %78) - %80 = call float @llvm.maxnum.f32(float %72, float 0.000000e+00) - %81 = call float @llvm.maxnum.f32(float %73, float %77) - %82 = call float @llvm.maxnum.f32(float undef, float %79) - %83 = call float @llvm.minnum.f32(float %80, float %81) - %84 = call float @llvm.minnum.f32(float %83, float undef) - %85 = fsub float %14, undef - %86 = fsub float %15, undef - %87 = fsub float %16, undef - %88 = fmul float %85, undef - %89 = fmul float %86, undef - %90 = fmul float %87, undef - %91 = fsub float %17, undef - %92 = fsub float %18, undef - %93 = fsub float %19, undef - %94 = fmul float %91, 0.000000e+00 - %95 = fmul float %92, undef - %96 = fmul float %93, undef - %97 = call float @llvm.minnum.f32(float %89, float %95) - %98 = call float @llvm.maxnum.f32(float %88, float %94) - %99 = call float @llvm.maxnum.f32(float %90, float %96) - %100 = call float @llvm.maxnum.f32(float undef, float %97) - %101 = call float @llvm.maxnum.f32(float %100, float undef) - %102 = call float @llvm.minnum.f32(float %98, float undef) - %103 = call float @llvm.minnum.f32(float %102, float %99) - %104 = fsub float %30, undef - %105 = fsub float %31, undef - %106 = fmul float %104, 0.000000e+00 - %107 = fmul float %105, 0.000000e+00 - %108 = call float @llvm.minnum.f32(float undef, float %106) - %109 = call float @llvm.maxnum.f32(float undef, float %107) - %110 = call float @llvm.maxnum.f32(float undef, float %108) - %111 = call float @llvm.maxnum.f32(float %110, float undef) - %112 = call float @llvm.minnum.f32(float undef, float %109) - %113 = 
fsub float %32, undef - %114 = fsub float %33, undef - %115 = fsub float %34, undef - %116 = fmul float %113, 0.000000e+00 - %117 = fmul float %114, undef - %118 = fmul float %115, undef - %119 = fsub float %35, undef - %120 = fsub float %36, undef - %121 = fsub float %37, undef - %122 = fmul float %119, undef - %123 = fmul float %120, undef - %124 = fmul float %121, undef - %125 = call float @llvm.minnum.f32(float %116, float %122) - %126 = call float @llvm.minnum.f32(float %117, float %123) - %127 = call float @llvm.minnum.f32(float %118, float %124) - %128 = call float @llvm.maxnum.f32(float %125, float %126) - %129 = call float @llvm.maxnum.f32(float %128, float %127) - %130 = fsub float %38, undef - %131 = fsub float %39, undef - %132 = fsub float %40, undef - %133 = fmul float %130, 0.000000e+00 - %134 = fmul float %131, undef - %135 = fmul float %132, undef - %136 = fsub float %41, undef - %137 = fsub float %42, undef - %138 = fsub float %43, undef - %139 = fmul float %136, undef - %140 = fmul float %137, undef - %141 = fmul float %138, undef - %142 = call float @llvm.minnum.f32(float %133, float %139) - %143 = call float @llvm.minnum.f32(float %134, float %140) - %144 = call float @llvm.minnum.f32(float %135, float %141) - %145 = call float @llvm.maxnum.f32(float %142, float %143) - %146 = call float @llvm.maxnum.f32(float %145, float %144) - %147 = fsub float %44, undef - %148 = fsub float %45, undef - %149 = fsub float %46, undef - %150 = fmul float %147, 0.000000e+00 - %151 = fmul float %148, 0.000000e+00 - %152 = fmul float %149, undef - %153 = fsub float %47, undef - %154 = fsub float %48, undef - %155 = fsub float %49, undef - %156 = fmul float %153, undef - %157 = fmul float %154, 0.000000e+00 - %158 = fmul float %155, undef - %159 = call float @llvm.minnum.f32(float %150, float %156) - %160 = call float @llvm.minnum.f32(float %151, float %157) - %161 = call float @llvm.minnum.f32(float %152, float %158) - %162 = call float @llvm.maxnum.f32(float %159, float %160) - %163 = call float @llvm.maxnum.f32(float %162, float %161) - %164 = fsub float %50, undef - %165 = fsub float %51, undef - %166 = fsub float %52, undef - %167 = fmul float %164, undef - %168 = fmul float %165, 0.000000e+00 - %169 = fmul float %166, 0.000000e+00 - %170 = fsub float %53, undef - %171 = fsub float %54, undef - %172 = fsub float %55, undef - %173 = fdiv float 1.000000e+00, %temp18.0 - %174 = fmul float %170, undef - %175 = fmul float %171, undef - %176 = fmul float %172, %173 - %177 = call float @llvm.minnum.f32(float %167, float %174) - %178 = call float @llvm.minnum.f32(float %168, float %175) - %179 = call float @llvm.minnum.f32(float %169, float %176) - %180 = call float @llvm.maxnum.f32(float %177, float %178) - %181 = call float @llvm.maxnum.f32(float %180, float %179) - %182 = fsub float %62, undef - %183 = fsub float %63, undef - %184 = fsub float %64, undef - %185 = fmul float %182, 0.000000e+00 - %186 = fmul float %183, undef - %187 = fmul float %184, undef - %188 = fsub float %65, undef - %189 = fsub float %66, undef - %190 = fmul float %188, undef - %191 = fmul float %189, undef - %192 = call float @llvm.maxnum.f32(float %185, float %190) - %193 = call float @llvm.maxnum.f32(float %186, float %191) - %194 = call float @llvm.maxnum.f32(float %187, float undef) - %195 = call float @llvm.minnum.f32(float %192, float %193) - %196 = call float @llvm.minnum.f32(float %195, float %194) - %.temp292.7 = select i1 undef, float %163, float undef - %temp292.9 = select i1 false, float %181, float 
%.temp292.7 - %.temp292.9 = select i1 undef, float undef, float %temp292.9 - %197 = fcmp ogt float undef, 0.000000e+00 - %198 = fcmp olt float undef, %196 - %199 = and i1 %197, %198 - %200 = fcmp olt float undef, %.temp292.9 - %201 = and i1 %199, %200 - %temp292.11 = select i1 %201, float undef, float %.temp292.9 - br i1 undef, label %IF2565, label %ELSE2566 - -IF2565: ; preds = %ENDIF - br i1 false, label %ENDIF2582, label %ELSE2584 - -ELSE2566: ; preds = %ENDIF - %202 = fcmp oeq float %temp292.11, 1.000000e+04 - br i1 %202, label %ENDLOOP, label %ELSE2593 - -ENDIF2564: ; preds = %ENDIF2594, %ENDIF2588 - %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ] - %temp18.1 = phi float [ %219, %ENDIF2588 ], [ undef, %ENDIF2594 ] - %203 = fsub float %5, undef - %204 = fmul float %203, undef - %205 = call float @llvm.maxnum.f32(float undef, float %204) - %206 = call float @llvm.minnum.f32(float %205, float undef) - %207 = call float @llvm.minnum.f32(float %206, float undef) - %208 = fcmp ogt float undef, 0.000000e+00 - %209 = fcmp olt float undef, 1.000000e+00 - %210 = and i1 %208, %209 - %211 = fcmp olt float undef, %207 - %212 = and i1 %210, %211 - br i1 %212, label %ENDIF2795, label %ELSE2797 - -ELSE2584: ; preds = %IF2565 - br label %ENDIF2582 - -ENDIF2582: ; preds = %ELSE2584, %IF2565 - %213 = fadd float %1, undef - %214 = fadd float 0.000000e+00, %213 - %215 = call float @llvm.AMDIL.fraction.(float %214) - br i1 undef, label %IF2589, label %ELSE2590 - -IF2589: ; preds = %ENDIF2582 - br label %ENDIF2588 - -ELSE2590: ; preds = %ENDIF2582 - br label %ENDIF2588 - -ENDIF2588: ; preds = %ELSE2590, %IF2589 - %216 = fsub float 1.000000e+00, %215 - %217 = call float @llvm.sqrt.f32(float %216) - %218 = fmul float %217, undef - %219 = fadd float %218, undef - br label %ENDIF2564 - -ELSE2593: ; preds = %ELSE2566 - %220 = fcmp oeq float %temp292.11, %82 - %221 = fcmp olt float %82, %84 - %222 = and i1 %220, %221 - br i1 %222, label %ENDIF2594, label %ELSE2596 - -ELSE2596: ; preds = %ELSE2593 - %223 = fcmp oeq float %temp292.11, %101 - %224 = fcmp olt float %101, %103 - %225 = and i1 %223, %224 - br i1 %225, label %ENDIF2594, label %ELSE2632 - -ENDIF2594: ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593 - %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ] - %226 = fmul float %temp894.2, undef - br label %ENDIF2564 - -ELSE2632: ; preds = %ELSE2596 - br i1 undef, label %ENDIF2594, label %ELSE2650 - -ELSE2650: ; preds = %ELSE2632 - %227 = fcmp oeq float %temp292.11, %111 - %228 = fcmp olt float %111, %112 - %229 = and i1 %227, %228 - br i1 %229, label %IF2667, label %ELSE2668 - -IF2667: ; preds = %ELSE2650 - br i1 undef, label %ENDIF2594, label %ELSE2671 - -ELSE2668: ; preds = %ELSE2650 - %230 = fcmp oeq float %temp292.11, %129 - %231 = fcmp olt float %129, undef - %232 = and i1 %230, %231 - br i1 %232, label %ENDIF2594, label %ELSE2686 - -ELSE2671: ; preds = %IF2667 - br label %ENDIF2594 - -ELSE2686: ; preds = %ELSE2668 - %233 = fcmp oeq float 
%temp292.11, %146 - %234 = fcmp olt float %146, undef - %235 = and i1 %233, %234 - br i1 %235, label %ENDIF2594, label %ELSE2704 - -ELSE2704: ; preds = %ELSE2686 - %236 = fcmp oeq float %temp292.11, %181 - %237 = fcmp olt float %181, undef - %238 = and i1 %236, %237 - br i1 %238, label %ENDIF2594, label %ELSE2740 - -ELSE2740: ; preds = %ELSE2704 - br i1 undef, label %IF2757, label %ELSE2758 - -IF2757: ; preds = %ELSE2740 - br i1 undef, label %ENDIF2594, label %ELSE2761 - -ELSE2758: ; preds = %ELSE2740 - br i1 undef, label %IF2775, label %ENDIF2594 - -ELSE2761: ; preds = %IF2757 - br label %ENDIF2594 - -IF2775: ; preds = %ELSE2758 - %239 = fcmp olt float undef, undef - br i1 %239, label %ENDIF2594, label %ELSE2779 - -ELSE2779: ; preds = %IF2775 - br i1 undef, label %ENDIF2594, label %ELSE2782 - -ELSE2782: ; preds = %ELSE2779 - br i1 undef, label %ENDIF2594, label %ELSE2785 - -ELSE2785: ; preds = %ELSE2782 - %240 = fcmp olt float undef, 0.000000e+00 - br i1 %240, label %ENDIF2594, label %ELSE2788 - -ELSE2788: ; preds = %ELSE2785 - %241 = fcmp olt float 0.000000e+00, undef - %.2848 = select i1 %241, float -1.000000e+00, float 1.000000e+00 - br label %ENDIF2594 - -ELSE2797: ; preds = %ENDIF2564 - %242 = fsub float %8, undef - %243 = fsub float %9, undef - %244 = fsub float %10, undef - %245 = fmul float %242, undef - %246 = fmul float %243, undef - %247 = fmul float %244, undef - %248 = fsub float %11, undef - %249 = fsub float %12, undef - %250 = fsub float %13, undef - %251 = fmul float %248, undef - %252 = fmul float %249, undef - %253 = fmul float %250, undef - %254 = call float @llvm.minnum.f32(float %245, float %251) - %255 = call float @llvm.minnum.f32(float %246, float %252) - %256 = call float @llvm.maxnum.f32(float %247, float %253) - %257 = call float @llvm.maxnum.f32(float %254, float %255) - %258 = call float @llvm.maxnum.f32(float %257, float undef) - %259 = call float @llvm.minnum.f32(float undef, float %256) - %260 = fcmp ogt float %258, 0.000000e+00 - %261 = fcmp olt float %258, 1.000000e+00 - %262 = and i1 %260, %261 - %263 = fcmp olt float %258, %259 - %264 = and i1 %262, %263 - br i1 %264, label %ENDIF2795, label %ELSE2800 - -ENDIF2795: ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564 - br label %LOOP - -ELSE2800: ; preds = %ELSE2797 - br i1 undef, label %ENDIF2795, label %ELSE2803 - -ELSE2803: ; preds = %ELSE2800 - %265 = fsub float %20, undef - %266 = fsub float %21, undef - %267 = fsub float %22, undef - %268 = fmul float %265, undef - %269 = fmul float %266, undef - %270 = fmul float %267, 0.000000e+00 - %271 = fsub float %23, undef - %272 = fsub float %24, undef - %273 = fsub float %25, undef - %274 = fmul float %271, undef - %275 = fmul float %272, undef - %276 = fmul float %273, undef - %277 = call float @llvm.minnum.f32(float %268, float %274) - %278 = call float @llvm.maxnum.f32(float %269, float %275) - %279 = call float @llvm.maxnum.f32(float %270, float %276) - %280 = call float @llvm.maxnum.f32(float %277, float undef) - %281 = call float @llvm.maxnum.f32(float %280, float undef) - %282 = call float @llvm.minnum.f32(float undef, float %278) - %283 = call float @llvm.minnum.f32(float %282, float %279) - %284 = fcmp ogt float %281, 0.000000e+00 - %285 = fcmp olt float %281, 1.000000e+00 - %286 = and i1 %284, %285 - %287 = fcmp olt float %281, %283 - %288 = and i1 %286, %287 - br i1 %288, label %ENDIF2795, label %ELSE2806 - -ELSE2806: ; preds = %ELSE2803 - %289 = fsub float %26, 
undef - %290 = fsub float %27, undef - %291 = fsub float %28, undef - %292 = fmul float %289, undef - %293 = fmul float %290, 0.000000e+00 - %294 = fmul float %291, undef - %295 = fsub float %29, undef - %296 = fmul float %295, undef - %297 = call float @llvm.minnum.f32(float %292, float %296) - %298 = call float @llvm.minnum.f32(float %293, float undef) - %299 = call float @llvm.maxnum.f32(float %294, float undef) - %300 = call float @llvm.maxnum.f32(float %297, float %298) - %301 = call float @llvm.maxnum.f32(float %300, float undef) - %302 = call float @llvm.minnum.f32(float undef, float %299) - %303 = fcmp ogt float %301, 0.000000e+00 - %304 = fcmp olt float %301, 1.000000e+00 - %305 = and i1 %303, %304 - %306 = fcmp olt float %301, %302 - %307 = and i1 %305, %306 - br i1 %307, label %ENDIF2795, label %ELSE2809 - -ELSE2809: ; preds = %ELSE2806 - br i1 undef, label %ENDIF2795, label %ELSE2812 - -ELSE2812: ; preds = %ELSE2809 - br i1 undef, label %ENDIF2795, label %ELSE2815 - -ELSE2815: ; preds = %ELSE2812 - br i1 undef, label %ENDIF2795, label %ELSE2818 - -ELSE2818: ; preds = %ELSE2815 - br i1 undef, label %ENDIF2795, label %ELSE2821 - -ELSE2821: ; preds = %ELSE2818 - %308 = fsub float %56, undef - %309 = fsub float %57, undef - %310 = fsub float %58, undef - %311 = fmul float %308, undef - %312 = fmul float %309, 0.000000e+00 - %313 = fmul float %310, undef - %314 = fsub float %59, undef - %315 = fsub float %60, undef - %316 = fsub float %61, undef - %317 = fmul float %314, undef - %318 = fmul float %315, undef - %319 = fmul float %316, undef - %320 = call float @llvm.maxnum.f32(float %311, float %317) - %321 = call float @llvm.maxnum.f32(float %312, float %318) - %322 = call float @llvm.maxnum.f32(float %313, float %319) - %323 = call float @llvm.minnum.f32(float %320, float %321) - %324 = call float @llvm.minnum.f32(float %323, float %322) - %325 = fcmp ogt float undef, 0.000000e+00 - %326 = fcmp olt float undef, 1.000000e+00 - %327 = and i1 %325, %326 - %328 = fcmp olt float undef, %324 - %329 = and i1 %327, %328 - br i1 %329, label %ENDIF2795, label %ELSE2824 - -ELSE2824: ; preds = %ELSE2821 - %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00 - br label %ENDIF2795 -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; Function Attrs: readnone -declare float @llvm.AMDIL.fraction.(float) #2 - -; Function Attrs: nounwind readnone -declare float @llvm.sqrt.f32(float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.minnum.f32(float, float) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.maxnum.f32(float, float) #1 - -; Function Attrs: readnone -declare float @llvm.AMDGPU.lrp(float, float, float) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll deleted file mode 100644 index 5a6129aaa3f..00000000000 --- a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll +++ /dev/null @@ -1,236 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s - -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.AMDGPU.barrier.local() #2 - - -@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 -@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8 -@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8 - -; FUNC-LABEL: @reorder_local_load_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; CI: buffer_store_dword -define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - - %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: buffer_store_dword -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - - %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - store volatile i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; CI: buffer_store_dword -define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - - %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - store i32 99, i32 addrspace(1)* %gptr, align 4 - call void @llvm.AMDGPU.barrier.local() #2 - %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; Technically we could reorder these, but just comparing the -; instruction type of the load is insufficient. 
- -; FUNC-LABEL: @no_reorder_constant_load_global_store_constant_load -; CI: buffer_load_dword -; CI: buffer_store_dword -; CI: buffer_load_dword -; CI: buffer_store_dword -define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 - store i32 99, i32 addrspace(1)* %gptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_constant_load_local_store_constant_load -; CI: buffer_load_dword -; CI: buffer_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword -define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { - %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 - - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 - store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load -; CI: s_load_dword -; CI: s_load_dword -; CI: s_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword -define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(2)* %ptr1, align 4 - store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(2)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_global_load_local_store_global_load -; CI: buffer_load_dword -; CI: buffer_load_dword -; CI: ds_write_b32 -; CI: buffer_store_dword -define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 - %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 2 - - %tmp1 = load i32, i32 addrspace(1)* %ptr1, align 4 - store i32 99, i32 addrspace(3)* %lptr, align 4 - %tmp2 = load i32, i32 addrspace(1)* %ptr2, align 4 - - %add = add nsw i32 %tmp1, %tmp2 - - store i32 %add, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_local_offsets -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 -; CI: buffer_store_dword -; CI: s_endpgm -define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 - 
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 - %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 101 - - store i32 123, i32 addrspace(3)* %ptr1, align 4 - %tmp1 = load i32, i32 addrspace(3)* %ptr2, align 4 - %tmp2 = load i32, i32 addrspace(3)* %ptr3, align 4 - store i32 123, i32 addrspace(3)* %ptr2, align 4 - %tmp3 = load i32, i32 addrspace(3)* %ptr1, align 4 - store i32 789, i32 addrspace(3)* %ptr3, align 4 - - %add.0 = add nsw i32 %tmp2, %tmp1 - %add.1 = add nsw i32 %add.0, %tmp3 - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: @reorder_global_offsets -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 -; CI: buffer_store_dword -; CI: s_endpgm -define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { - %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 - %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 - %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 101 - - store i32 123, i32 addrspace(1)* %ptr1, align 4 - %tmp1 = load i32, i32 addrspace(1)* %ptr2, align 4 - %tmp2 = load i32, i32 addrspace(1)* %ptr3, align 4 - store i32 123, i32 addrspace(1)* %ptr2, align 4 - %tmp3 = load i32, i32 addrspace(1)* %ptr1, align 4 - store i32 789, i32 addrspace(1)* %ptr3, align 4 - - %add.0 = add nsw i32 %tmp2, %tmp1 - %add.1 = add nsw i32 %add.0, %tmp3 - store i32 %add.1, i32 addrspace(1)* %out, align 4 - ret void -} - -; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load -; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 -; XCI: TBUFFER_STORE_FORMAT -; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 -; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 { -; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 - -; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 -; %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 2 - -; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 - -; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) - -; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 - -; %add = add nsw i32 %tmp1, %tmp2 - -; store i32 %add, i32 addrspace(1)* %out, align 4 -; ret void -; } - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind noduplicate } diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll deleted file mode 100644 
index bd427dd3ed4..00000000000 --- a/test/CodeGen/R600/si-vector-hang.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -; CHECK: {{^}}test_8_min_char: -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; CHECK: buffer_store_byte -; ModuleID = 'radeon' - -define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { -entry: - %0 = load i8, i8 addrspace(1)* %in0, align 1 - %1 = insertelement <8 x i8> undef, i8 %0, i32 0 - %arrayidx2.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 1 - %2 = load i8, i8 addrspace(1)* %arrayidx2.i.i, align 1 - %3 = insertelement <8 x i8> %1, i8 %2, i32 1 - %arrayidx6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 2 - %4 = load i8, i8 addrspace(1)* %arrayidx6.i.i, align 1 - %5 = insertelement <8 x i8> %3, i8 %4, i32 2 - %arrayidx10.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 3 - %6 = load i8, i8 addrspace(1)* %arrayidx10.i.i, align 1 - %7 = insertelement <8 x i8> %5, i8 %6, i32 3 - %arrayidx.i.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 4 - %8 = load i8, i8 addrspace(1)* %arrayidx.i.i, align 1 - %9 = insertelement <8 x i8> undef, i8 %8, i32 0 - %arrayidx2.i9.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 5 - %10 = load i8, i8 addrspace(1)* %arrayidx2.i9.i, align 1 - %11 = insertelement <8 x i8> %9, i8 %10, i32 1 - %arrayidx6.i11.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 6 - %12 = load i8, i8 addrspace(1)* %arrayidx6.i11.i, align 1 - %13 = insertelement <8 x i8> %11, i8 %12, i32 2 - %arrayidx10.i13.i = getelementptr inbounds i8, i8 addrspace(1)* %in0, i64 7 - %14 = load i8, i8 addrspace(1)* %arrayidx10.i13.i, align 1 - %15 = insertelement <8 x i8> %13, i8 %14, i32 3 - %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> - %16 = load i8, i8 addrspace(1)* %in1, align 1 - %17 = insertelement <8 x i8> undef, i8 %16, i32 0 - %arrayidx2.i.i4 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 1 - %18 = load i8, i8 addrspace(1)* %arrayidx2.i.i4, align 1 - %19 = insertelement <8 x i8> %17, i8 %18, i32 1 - %arrayidx6.i.i5 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 2 - %20 = load i8, i8 addrspace(1)* %arrayidx6.i.i5, align 1 - %21 = insertelement <8 x i8> %19, i8 %20, i32 2 - %arrayidx10.i.i6 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 3 - %22 = load i8, i8 addrspace(1)* %arrayidx10.i.i6, align 1 - %23 = insertelement <8 x i8> %21, i8 %22, i32 3 - %arrayidx.i.i7 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 4 - %24 = load i8, i8 addrspace(1)* %arrayidx.i.i7, align 1 - %25 = insertelement <8 x i8> undef, i8 %24, i32 0 - %arrayidx2.i9.i8 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 5 - %26 = load i8, i8 addrspace(1)* %arrayidx2.i9.i8, align 1 - %27 = insertelement <8 x i8> %25, i8 %26, i32 1 - %arrayidx6.i11.i9 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 6 - %28 = load i8, i8 addrspace(1)* %arrayidx6.i11.i9, align 1 - %29 = insertelement <8 x i8> %27, i8 %28, i32 2 - %arrayidx10.i13.i10 = getelementptr inbounds i8, i8 addrspace(1)* %in1, i64 7 - %30 = load i8, i8 addrspace(1)* %arrayidx10.i13.i10, align 1 - %31 = insertelement <8 x i8> %29, i8 
%30, i32 3 - %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> - %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11 - %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11 - %32 = extractelement <8 x i8> %cond.i, i32 0 - store i8 %32, i8 addrspace(1)* %out, align 1 - %33 = extractelement <8 x i8> %cond.i, i32 1 - %arrayidx2.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1 - %34 = extractelement <8 x i8> %cond.i, i32 2 - %arrayidx.i.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 2 - store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1 - %35 = extractelement <8 x i8> %cond.i, i32 3 - %arrayidx2.i6.i.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 3 - store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1 - %arrayidx.i.i3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4 - %36 = extractelement <8 x i8> %cond.i, i32 4 - store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1 - %37 = extractelement <8 x i8> %cond.i, i32 5 - %arrayidx2.i.i6.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 5 - store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1 - %38 = extractelement <8 x i8> %cond.i, i32 6 - %arrayidx.i.i7.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 6 - store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1 - %39 = extractelement <8 x i8> %cond.i, i32 7 - %arrayidx2.i6.i8.i = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 7 - store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1 - ret void -} - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8} - -!0 = !{null} -!1 = !{null} -!2 = !{null} -!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char} -!4 = !{null} -!5 = !{null} -!6 = !{null} -!7 = !{null} -!8 = !{null} diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll deleted file mode 100644 index 06bee114c23..00000000000 --- a/test/CodeGen/R600/sign_extend.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_sext_i1_to_i32: -; SI: v_cndmask_b32_e64 -; SI: s_endpgm -define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i32 - store i32 %sext, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}test_s_sext_i32_to_i64: -; SI: s_ashr_i32 -; SI: s_endpg -define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { -entry: - %mul = mul i32 %a, %b - %add = add i32 %mul, %c - %sext = sext i32 %add to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i1_to_i64: -; SI: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc -; SI: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] -; SI: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} -; SI: s_endpgm -define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq i32 %a, %b - %sext = sext i1 %cmp to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - 
-; SI-LABEL: {{^}}s_sext_i32_to_i64: -; SI: s_ashr_i32 -; SI: s_endpgm -define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { - %sext = sext i32 %a to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}v_sext_i32_to_i64: -; SI: v_ashr -; SI: s_endpgm -define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { - %val = load i32, i32 addrspace(1)* %in, align 4 - %sext = sext i32 %val to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_sext_i16_to_i64: -; SI: s_endpgm -define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { - %sext = sext i16 %a to i64 - store i64 %sext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll deleted file mode 100644 index dffee70b6b0..00000000000 --- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll +++ /dev/null @@ -1,39 +0,0 @@ -; XFAIL: * -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s - -; 64-bit select was originally lowered with a build_pair, and this -; could be simplified to 1 cndmask instead of 2, but that broken when -; it started being implemented with a v2i32 build_vector and -; bitcasting. -define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { - %cmp = icmp eq i32 %c, 0 - %select = select i1 %cmp, i64 %a, i64 %b - %trunc = trunc i64 %select to i32 - store i32 %trunc, i32 addrspace(1)* %out, align 4 - ret void -} - -; FIXME: Fix truncating store for local memory -; SI-LABEL: {{^}}trunc_load_alloca_i64: -; SI: v_movrels_b32 -; SI-NOT: v_movrels_b32 -; SI: s_endpgm -define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { - %idx = add i32 %a, %b - %alloca = alloca i64, i32 4 - %gep0 = getelementptr i64, i64* %alloca, i64 0 - %gep1 = getelementptr i64, i64* %alloca, i64 1 - %gep2 = getelementptr i64, i64* %alloca, i64 2 - %gep3 = getelementptr i64, i64* %alloca, i64 3 - store i64 24, i64* %gep0, align 8 - store i64 9334, i64* %gep1, align 8 - store i64 3935, i64* %gep2, align 8 - store i64 9342, i64* %gep3, align 8 - %gep = getelementptr i64, i64* %alloca, i32 %idx - %load = load i64, i64* %gep, align 8 - %mask = and i64 %load, 4294967296 - %add = add i64 %mask, -1 - store i64 %add, i64 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll deleted file mode 100644 index da4e91db3a3..00000000000 --- a/test/CodeGen/R600/sint_to_fp.f64.ll +++ /dev/null @@ -1,61 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 -; SI: v_cvt_f64_i32_e32 -define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { - %result = sitofp i32 %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; FIXME: select on 0, 0 -; SI-LABEL: {{^}}sint_to_fp_i1_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] -; SI: 
buffer_store_dwordx2 -; SI: s_endpgm -define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = sitofp i1 %cmp to double - store double %fp, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}sint_to_fp_i1_f64_load: -; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, -1 -; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { - %fp = sitofp i1 %in to double - store double %fp, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: @s_sint_to_fp_i64_to_f64 -define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { - %result = sitofp i64 %in to double - store double %result, double addrspace(1)* %out - ret void -} - -; SI-LABEL: @v_sint_to_fp_i64_to_f64 -; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %val = load i64, i64 addrspace(1)* %gep, align 8 - %result = sitofp i64 %val to double - store double %result, double addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll deleted file mode 100644 index 8506441d136..00000000000 --- a/test/CodeGen/R600/sint_to_fp.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32: -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} -define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { - %result = sitofp i32 %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_v2i32: -; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X - -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { - %result = sitofp <2 x i32> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_v4i32: -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -; SI: v_cvt_f32_i32_e32 -define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %value = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = sitofp <4 x i32> %value to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} - -; 
FUNC-LABEL: {{^}}sint_to_fp_i1_f32: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load: -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) { - %fp = sitofp i1 %in to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll deleted file mode 100644 index b0c18ca5959..00000000000 --- a/test/CodeGen/R600/smrd.ll +++ /dev/null @@ -1,111 +0,0 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s - -; SMRD load with an immediate offset. -; GCN-LABEL: {{^}}smrd0: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with the largest possible immediate offset. -; GCN-LABEL: {{^}}smrd1: -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with an offset greater than the largest possible immediate. 
-; GCN-LABEL: {{^}}smrd2: -; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load with a 64-bit offset -; GCN-LABEL: {{^}}smrd3: -; FIXME: There are too many copies here because we don't fold immediates -; through REG_SEQUENCE -; SI: s_mov_b32 s[[SLO:[0-9]+]], 0 ; -; SI: s_mov_b32 s[[SHI:[0-9]+]], 4 -; SI: s_mov_b32 s[[SSLO:[0-9]+]], s[[SLO]] -; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SSLO]] -; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] -; FIXME: We should be able to use s_load_dword here -; SI: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 -; TODO: Add VI checks -; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { -entry: - %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 - %1 = load i32, i32 addrspace(2)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SMRD load using the load.const intrinsic with an immediate offset -; GCN-LABEL: {{^}}smrd_load_const0: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} - -; SMRD load using the load.const intrinsic with the largest possible immediate -; offset. -; GCN-LABEL: {{^}}smrd_load_const1: -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} -; SMRD load using the load.const intrinsic with an offset greater than the -; largets possible immediate. -; immediate offset. 
-; GCN-LABEL: {{^}}smrd_load_const2: -; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400 -; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] -; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { -main_body: - %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 - %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 - %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024) - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22) - ret void -} - -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/split-scalar-i64-add.ll b/test/CodeGen/R600/split-scalar-i64-add.ll deleted file mode 100644 index 46409cdfae1..00000000000 --- a/test/CodeGen/R600/split-scalar-i64-add.ll +++ /dev/null @@ -1,48 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -declare i32 @llvm.r600.read.tidig.x() readnone - -; This is broken because the low half of the 64-bit add remains on the -; SALU, but the upper half does not. The addc expects the carry bit -; set in vcc, which is undefined since the low scalar half add sets -; scc instead. - -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { - %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, 399 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { - %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, %val1 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} - -; Doesn't use constants -; FUNC-LABEL @imp_def_vcc_split_i64_add_2 -; SI: v_add_i32 -; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid - %load = load i32, i32 addrspace(1)* %gep - %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 - %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1 - %bc = bitcast <2 x i32> %vec.1 to i64 - %add = add i64 %bc, %val1 - store i64 %add, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll deleted file mode 100644 index bcbc32f4c05..00000000000 --- a/test/CodeGen/R600/sra.ll +++ /dev/null @@ -1,213 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc 
< %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s - -;EG-LABEL: {{^}}ashr_v2i32: -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_v2i32: -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v2i32: -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = ashr <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v4i32: -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_v4i32: -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;SI: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v4i32: -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -;VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = ashr <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_i64: -;EG: ASHR - -;SI-LABEL: {{^}}ashr_i64: -;SI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 - -;VI-LABEL: {{^}}ashr_i64: -;VI: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 - -define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) { -entry: - %0 = sext i32 %in to i64 - %1 = ashr i64 %0, 8 - store i64 %1, i64 addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_i64_2: -;EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -;EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -;EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -;EG_CHECK-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -;EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -;EG-DAG: ASHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -;EG-DAG: ASHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal -;EG-DAG: ASHR {{\*? *}}[[HIBIG:T[0-9]+\.[XYZW]]], [[OPHI]], literal -;EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -;EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -;EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} - -;SI-LABEL: {{^}}ashr_i64_2: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_i64_2: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { -entry: - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1) * %in - %b = load i64, i64 addrspace(1) * %b_ptr - %result = ashr i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v2i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: ASHR {{.*}}, [[SHA]] -;EG-DAG: ASHR {{.*}}, [[SHB]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI-LABEL: {{^}}ashr_v2i64: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v2i64: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1) * %in - %b = load <2 x i64>, <2 x i64> addrspace(1) * %b_ptr - %result = ashr <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}ashr_v4i64: -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -;EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHC]] -;EG-DAG: LSHL {{\*? *}}[[COMPSHD]] -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: LSHL {{.*}}, 1 -;EG-DAG: ASHR {{.*}}, [[SHA]] -;EG-DAG: ASHR {{.*}}, [[SHB]] -;EG-DAG: ASHR {{.*}}, [[SHC]] -;EG-DAG: ASHR {{.*}}, [[SHD]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: LSHR {{.*}}, [[SHA]] -;EG-DAG: LSHR {{.*}}, [[SHB]] -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: OR_INT -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: ASHR {{.*}}, literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -;EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT -;EG-DAG: CNDE_INT - -;SI-LABEL: {{^}}ashr_v4i64: -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -;SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -;VI-LABEL: {{^}}ashr_v4i64: -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -;VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1) * %in - %b = load <4 x i64>, <4 x i64> addrspace(1) * %b_ptr - %result = ashr <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/srem.ll b/test/CodeGen/R600/srem.ll deleted file mode 100644 index c78fd549b31..00000000000 --- a/test/CodeGen/R600/srem.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s - -define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in - %den = load i32, i32 addrspace(1) * %den_ptr - %result = srem i32 %num, %den - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = srem i32 %num, 4 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}srem_i32_7: -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 -; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], -; SI: v_mul_lo_i32 -; SI: v_sub_i32 -; SI: s_endpgm -define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = srem i32 %num, 7 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr - %result = srem <2 x i32> %num, %den - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %num = load <2 x i32>, <2 x i32> addrspace(1) * %in - %result = srem <2 x i32> %num, - store <2 x i32> %result, <2 x i32> 
addrspace(1)* %out - ret void -} - -define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr - %result = srem <4 x i32> %num, %den - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %num = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = srem <4 x i32> %num, - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %num = load i64, i64 addrspace(1) * %in - %den = load i64, i64 addrspace(1) * %den_ptr - %result = srem i64 %num, %den - store i64 %result, i64 addrspace(1)* %out - ret void -} - -define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %num = load i64, i64 addrspace(1) * %in - %result = srem i64 %num, 4 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %num = load <2 x i64>, <2 x i64> addrspace(1) * %in - %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr - %result = srem <2 x i64> %num, %den - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %num = load <2 x i64>, <2 x i64> addrspace(1) * %in - %result = srem <2 x i64> %num, - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %num = load <4 x i64>, <4 x i64> addrspace(1) * %in - %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr - %result = srem <4 x i64> %num, %den - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - -define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %num = load <4 x i64>, <4 x i64> addrspace(1) * %in - %result = srem <4 x i64> %num, - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll deleted file mode 100644 index 4904d7fa1bd..00000000000 --- a/test/CodeGen/R600/srl.ll +++ /dev/null @@ -1,186 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}lshr_i32: -; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; EG: LSHR {{\*? 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = lshr i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v2i32: -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = lshr <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v4i32: -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = lshr <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]] -; EG: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}} -; EG: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1 -; EG-DAG: ADD_INT {{\*? *}}[[BIGSH:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: LSHR {{\*? *}}[[LOSMTMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], [[SHIFT]] -; EG-DAG: OR_INT {{\*? *}}[[LOSM:T[0-9]+\.[XYZW]]], {{[[LOSMTMP]]|PV.[XYZW]}}, {{[[OVERF]]|PV.[XYZW]}} -; EG-DAG: LSHR {{\*? *}}[[HISM:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], {{PS|[[SHIFT]]}} -; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal -; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 -define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1)* %in - %b = load i64, i64 addrspace(1)* %b_ptr - %result = lshr i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v2i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1)* %in - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = lshr <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}lshr_v4i64: -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} -; SI: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} - -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -; VI: v_lshrrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} - -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{\*? *}}[[COMPSHD:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHD:T[0-9]+\.[XYZW]]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHA]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHB]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHC]] -; EG-DAG: LSHL {{\*? *}}[[COMPSHD]] -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHL {{.*}}, 1 -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHC]] -; EG-DAG: LSHR {{.*}}, [[SHD]] -; EG-DAG: LSHR {{.*}}, [[SHA]] -; EG-DAG: LSHR {{.*}}, [[SHB]] -; EG-DAG: LSHR {{.*}}, [[SHC]] -; EG-DAG: LSHR {{.*}}, [[SHD]] -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: OR_INT -; EG-DAG: ADD_INT {{\*? 
*}}[[BIGSHA:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHB:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHC:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: ADD_INT {{\*? *}}[[BIGSHD:T[0-9]+\.[XYZW]]]{{.*}}, literal -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: LSHR -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHA]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHB]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHC]], literal -; EG-DAG: SETGT_UINT {{\*? *T[0-9]\.[XYZW]}}, [[SHD]], literal -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT {{.*}}, 0.0 -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1)* %in - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = lshr <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll deleted file mode 100644 index 26884a1b776..00000000000 --- a/test/CodeGen/R600/ssubo.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s - -declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}ssubo_i64_zext: -define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_ssubo_i32: -define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %ssub, 0 - %carry = extractvalue { i32, i1 } %ssub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_ssubo_i32: -define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %ssub, 0 - %carry = extractvalue { i32, i1 } %ssub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_ssubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 -define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - store 
i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_ssubo_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %ssub, 0 - %carry = extractvalue { i64, i1 } %ssub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/store-barrier.ll b/test/CodeGen/R600/store-barrier.ll deleted file mode 100644 index 4a72b4d090a..00000000000 --- a/test/CodeGen/R600/store-barrier.ll +++ /dev/null @@ -1,42 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck --check-prefix=CHECK %s - -; This test is for a bug in the machine scheduler where stores without -; an underlying object would be moved across the barrier. In this -; test, the <2 x i8> store will be split into two i8 stores, so they -; won't have an underlying object. - -; CHECK-LABEL: {{^}}test: -; CHECK: ds_write_b8 -; CHECK: ds_write_b8 -; CHECK: s_barrier -; CHECK: s_endpgm -; Function Attrs: nounwind -define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) { -bb: - %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 - %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 - %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13 - %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2 - %tmp16 = add i32 %tmp13, 1 - %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16 - store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2 - tail call void @llvm.AMDGPU.barrier.local() #2 - %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4 - %tmp26 = sext i32 %tmp25 to i64 - %tmp27 = sext i32 %arg4 to i64 - %tmp28 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 %arg4 - %tmp29 = load i8, i8 addrspace(3)* %tmp28, align 1 - %tmp30 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 %tmp27 - store i8 %tmp29, i8 addrspace(1)* %tmp30, align 1 - %tmp32 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp25, i32 0 - %tmp33 = load i8, i8 addrspace(3)* %tmp32, align 1 - %tmp35 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(1)* %arg3, i64 %tmp26, i64 0 - store i8 %tmp33, i8 addrspace(1)* %tmp35, align 1 - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - -attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/R600/store-v3i32.ll b/test/CodeGen/R600/store-v3i32.ll deleted file mode 100644 index 33617b55ed6..00000000000 --- a/test/CodeGen/R600/store-v3i32.ll +++ /dev/null @@ -1,13 +0,0 @@ -; XFAIL: * -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck 
-check-prefix=SI %s - -; 3 vectors have the same size and alignment as 4 vectors, so this -; should be done in a single store. - -; SI-LABEL: {{^}}store_v3i32: -; SI: buffer_store_dwordx4 -define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { - store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 - ret void -} diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll deleted file mode 100644 index e0c554ad2c1..00000000000 --- a/test/CodeGen/R600/store-v3i64.ll +++ /dev/null @@ -1,29 +0,0 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}global_store_v3i64: -; SI: buffer_store_dwordx4 -; SI: buffer_store_dwordx4 -define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}global_store_v3i64_unaligned: -define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}local_store_v3i64: -define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 - ret void -} - -; SI-LABEL: {{^}}local_store_v3i64_unaligned: -define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { - store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 - ret void -} diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll deleted file mode 100644 index d5af3b29118..00000000000 --- a/test/CodeGen/R600/store-vector-ptrs.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s - -; This tests for a bug that caused a crash in -; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting -; scratch loads and stores. 
-; CHECK-LABEL: {{^}}store_vector_ptrs: -define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { - %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> - store <4 x i32*> %p, <4 x i32*>* %out - ret void -} diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll deleted file mode 100644 index 0f89405e073..00000000000 --- a/test/CodeGen/R600/store.ll +++ /dev/null @@ -1,369 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s - -;===------------------------------------------------------------------------===; -; Global Address Space -;===------------------------------------------------------------------------===; -; FUNC-LABEL: {{^}}store_i1: -; EG: MEM_RAT MSKOR -; SI: buffer_store_byte -define void @store_i1(i1 addrspace(1)* %out) { -entry: - store i1 true, i1 addrspace(1)* %out - ret void -} - -; i8 store -; FUNC-LABEL: {{^}}store_i8: -; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X - -; IG 0: Get the byte index and truncate the value -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y -; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) - - -; IG 1: Truncate the calculated the shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 255 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the prevous IGs. -; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 - -; SI: buffer_store_byte - -define void @store_i8(i8 addrspace(1)* %out, i8 %in) { -entry: - store i8 %in, i8 addrspace(1)* %out - ret void -} - -; i16 store -; FUNC-LABEL: {{^}}store_i16: -; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X - -; IG 0: Get the byte index and truncate the value - - -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG-NEXT: 3(4.203895e-45), - -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y - -; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) -; IG 1: Truncate the calculated the shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 65535 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the prevous IGs. 
-; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 - -; SI: buffer_store_short -define void @store_i16(i16 addrspace(1)* %out, i16 %in) { -entry: - store i16 %in, i16 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v2i8: -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR - -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { -entry: - %0 = trunc <2 x i32> %in to <2 x i8> - store <2 x i8> %0, <2 x i8> addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}store_v2i16: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_short -; SI: buffer_store_short -define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { -entry: - %0 = trunc <2 x i32> %in to <2 x i16> - store <2 x i16> %0, <2 x i16> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i8: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { -entry: - %0 = trunc <4 x i32> %in to <4 x i8> - store <4 x i8> %0, <4 x i8> addrspace(1)* %out - ret void -} - -; floating-point store -; FUNC-LABEL: {{^}}store_f32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1 - -; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}} - -; SI: buffer_store_dword - -define void @store_f32(float addrspace(1)* %out, float %in) { - store float %in, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i16: -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG: MEM_RAT MSKOR -; EG-NOT: MEM_RAT MSKOR - -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI: buffer_store_short -; SI-NOT: buffer_store_byte -define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { -entry: - %0 = trunc <4 x i32> %in to <4 x i16> - store <4 x i16> %0, <4 x i16> addrspace(1)* %out - ret void -} - -; vec2 floating-point stores -; FUNC-LABEL: {{^}}store_v2f32: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx2 - -define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { -entry: - %0 = insertelement <2 x float> , float %a, i32 0 - %1 = insertelement <2 x float> %0, float %b, i32 1 - store <2 x float> %1, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_v4i32: -; EG: MEM_RAT_CACHELESS STORE_RAW -; EG-NOT: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD -; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx4 -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { -entry: - store <4 x i32> %in, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_i64_i8: -; EG: MEM_RAT MSKOR -; SI: buffer_store_byte -define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_i64_i16: -; EG: MEM_RAT MSKOR -; SI: buffer_store_short -define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(1)* %out - ret void -} - -;===------------------------------------------------------------------------===; -; Local Address Space -;===------------------------------------------------------------------------===; - -; FUNC-LABEL: 
{{^}}store_local_i1: -; EG: LDS_BYTE_WRITE -; SI: ds_write_b8 -define void @store_local_i1(i1 addrspace(3)* %out) { -entry: - store i1 true, i1 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i8: -; EG: LDS_BYTE_WRITE - -; SI: ds_write_b8 -define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { - store i8 %in, i8 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i16: -; EG: LDS_SHORT_WRITE - -; SI: ds_write_b16 -define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { - store i16 %in, i16 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v2i16: -; EG: LDS_WRITE - -; CM: LDS_WRITE - -; SI: ds_write_b16 -; SI: ds_write_b16 -define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { -entry: - store <2 x i16> %in, <2 x i16> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v4i8: -; EG: LDS_WRITE - -; CM: LDS_WRITE - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { -entry: - store <4 x i8> %in, <4 x i8> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v2i32: -; EG: LDS_WRITE -; EG: LDS_WRITE - -; CM: LDS_WRITE -; CM: LDS_WRITE - -; SI: ds_write_b64 -define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { -entry: - store <2 x i32> %in, <2 x i32> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_v4i32: -; EG: LDS_WRITE -; EG: LDS_WRITE -; EG: LDS_WRITE -; EG: LDS_WRITE - -; CM: LDS_WRITE -; CM: LDS_WRITE -; CM: LDS_WRITE -; CM: LDS_WRITE - -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -; SI: ds_write_b32 -define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { -entry: - store <4 x i32> %in, <4 x i32> addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i64_i8: -; EG: LDS_BYTE_WRITE -; SI: ds_write_b8 -define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i8 - store i8 %0, i8 addrspace(3)* %out - ret void -} - -; FUNC-LABEL: {{^}}store_local_i64_i16: -; EG: LDS_SHORT_WRITE -; SI: ds_write_b16 -define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { -entry: - %0 = trunc i64 %in to i16 - store i16 %0, i16 addrspace(3)* %out - ret void -} - -; The stores in this function are combined by the optimizer to create a -; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer -; should not try to split the 64-bit store back into 2 32-bit stores. -; -; Evergreen / Northern Islands don't support 64-bit stores yet, so there should -; be two 32-bit stores. 
- -; FUNC-LABEL: {{^}}vecload2: -; EG: MEM_RAT_CACHELESS STORE_RAW - -; CM: MEM_RAT_CACHELESS STORE_DWORD - -; SI: buffer_store_dwordx2 -define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { -entry: - %0 = load i32, i32 addrspace(2)* %mem, align 4 - %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 - %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4 - store i32 %0, i32 addrspace(1)* %out, align 4 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 - ret void -} - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } - -; When i128 was a legal type this program generated cannot select errors: - -; FUNC-LABEL: {{^}}"i128-const-store": -; FIXME: We should be able to to this with one store instruction -; EG: STORE_RAW -; EG: STORE_RAW -; EG: STORE_RAW -; EG: STORE_RAW -; CM: STORE_DWORD -; CM: STORE_DWORD -; CM: STORE_DWORD -; CM: STORE_DWORD -; SI: buffer_store_dwordx4 -define void @i128-const-store(i32 addrspace(1)* %out) { -entry: - store i32 1, i32 addrspace(1)* %out, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 - store i32 1, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2 - store i32 2, i32 addrspace(1)* %arrayidx4, align 4 - %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 - store i32 2, i32 addrspace(1)* %arrayidx6, align 4 - ret void -} diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll deleted file mode 100644 index 696fb033b5e..00000000000 --- a/test/CodeGen/R600/store.r600.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s - -; XXX: Merge this test into store.ll once it is supported on SI - -; v4i32 store -; EG: {{^}}store_v4i32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 - -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %1 = load <4 x i32>, <4 x i32> addrspace(1) * %in - store <4 x i32> %1, <4 x i32> addrspace(1)* %out - ret void -} - -; v4f32 store -; EG: {{^}}store_v4f32: -; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 -define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { - %1 = load <4 x float>, <4 x float> addrspace(1) * %in - store <4 x float> %1, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/structurize.ll b/test/CodeGen/R600/structurize.ll deleted file mode 100644 index 02e592e9a55..00000000000 --- a/test/CodeGen/R600/structurize.ll +++ /dev/null @@ -1,83 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s -; Test case for a crash in the AMDILCFGStructurizer from a CFG like this: -; -; entry -; / \ -; diamond_head branch_from -; / \ | -; diamond_false diamond_true -; \ / -; done -; -; When the diamond_true branch had more than 100 instructions. -; -; - -; CHECK-LABEL: {{^}}branch_into_diamond: -; === entry block: -; CHECK: ALU_PUSH_BEFORE -; === Branch instruction (IF): -; CHECK: JUMP - ; === branch_from block - ; CHECK: ALU - ; === Duplicated diamond_true block (There can be more than one ALU clause): - ; === XXX: We should be able to optimize this so the basic block is not - ; === duplicated. 
See comments in - ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf() - ; CHECK: ALU -; === Branch instruction (ELSE): -; CHECK: ELSE - ; === diamond_head block: - ; CHECK: ALU_PUSH_BEFORE - ; === Branch instruction (IF): - ; CHECK: JUMP - ; === diamond_true block (There can be more than one ALU clause): - ; ALU - ; === Branch instruction (ELSE): - ; CHECK: ELSE - ; === diamond_false block plus implicit ENDIF - ; CHECK: ALU_POP_AFTER -; === Branch instruction (ENDIF): -; CHECK: POP -; === done block: -; CHECK: ALU -; CHECK: MEM_RAT_CACHELESS -; CHECK: CF_END - - -define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: -%0 = icmp ne i32 %a, 0 - br i1 %0, label %diamond_head, label %branch_from - -diamond_head: - %1 = icmp ne i32 %a, 1 - br i1 %1, label %diamond_true, label %diamond_false - -branch_from: - %2 = add i32 %a, 1 - br label %diamond_true - -diamond_false: - %3 = add i32 %a, 2 - br label %done - -diamond_true: - %4 = phi i32 [%2, %branch_from], [%a, %diamond_head] - ; This block needs to be > 100 ISA instructions to hit the bug, - ; so we'll use udiv instructions. - %div0 = udiv i32 %a, %b - %div1 = udiv i32 %div0, %4 - %div2 = udiv i32 %div1, 11 - %div3 = udiv i32 %div2, %a - %div4 = udiv i32 %div3, %b - %div5 = udiv i32 %div4, %c - %div6 = udiv i32 %div5, %div0 - %div7 = udiv i32 %div6, %div1 - br label %done - -done: - %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true] - store i32 %5, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/structurize1.ll b/test/CodeGen/R600/structurize1.ll deleted file mode 100644 index 77432c1f9d2..00000000000 --- a/test/CodeGen/R600/structurize1.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s - -; This tests for abug where the AMDILCFGStructurizer was crashing on loops -; like this: -; -; for (i = 0; i < x; i++) { -; if (cond0) { -; if (cond1) { -; -; } else { -; -; } -; if (cond2) { -; -; } -; } -; } - -; CHECK-LABEL: {{^}}if_inside_loop: -; CHECK: LOOP_START_DX10 -; CHECK: END_LOOP -define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { -entry: - br label %for.body - -for.body: - %0 = phi i32 [0, %entry], [%inc, %for.inc] - %val = phi i32 [0, %entry], [%val.for.inc, %for.inc] - %inc = add i32 %0, 1 - %1 = icmp ult i32 10, %a - br i1 %1, label %for.inc, label %if.then - -if.then: - %2 = icmp ne i32 0, %b - br i1 %2, label %if.then.true, label %if.then.false - -if.then.true: - %3 = add i32 %a, %val - br label %if - -if.then.false: - %4 = mul i32 %a, %val - br label %if - -if: - %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false] - %5 = icmp ne i32 0, %c - br i1 %5, label %if.true, label %for.inc - -if.true: - %6 = add i32 %a, %val.if - br label %for.inc - -for.inc: - %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true] - %7 = icmp ne i32 0, %d - br i1 %7, label %for.body, label %exit - -exit: - store i32 %val.for.inc, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll deleted file mode 100644 index b7fba0efa5b..00000000000 --- a/test/CodeGen/R600/sub.ll +++ /dev/null @@ -1,130 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -declare i32 @llvm.r600.read.tidig.x() readnone - -; FUNC-LABEL: {{^}}test_sub_i32: -; EG: SUB_INT {{\** 
*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_subrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = sub i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}test_sub_v2i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = sub <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_sub_v4i32: -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = sub <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}s_sub_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { - %result = sub i64 %a, %b - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_sub_i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]] -; EG-DAG: SUB_INT {{[* ]*}}[[LO]] -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT {{[* ]*}}[[HI]] -; EG-NOT: SUB -define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid - %a = load i64, i64 addrspace(1)* %a_ptr - %b = load i64, i64 addrspace(1)* %b_ptr - %result = sub i64 %a, %b - store i64 %result, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}v_test_sub_v2i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> 
addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid - %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = sub <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -; SI: v_sub_i32_e32 -; SI: v_subb_u32_e32 -define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { - %tid = call i32 @llvm.r600.read.tidig.x() readnone - %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid - %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = sub <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/subreg-coalescer-crash.ll b/test/CodeGen/R600/subreg-coalescer-crash.ll deleted file mode 100644 index c4dae4736cf..00000000000 --- a/test/CodeGen/R600/subreg-coalescer-crash.ll +++ /dev/null @@ -1,109 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s - -; SI-LABEL:{{^}}row_filter_C1_D0: -; SI: s_endpgm -; Function Attrs: nounwind -define void @row_filter_C1_D0() { -entry: - br i1 undef, label %for.inc.1, label %do.body.preheader - -do.body.preheader: ; preds = %entry - %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1 - br i1 undef, label %do.body56.1, label %do.body90 - -do.body90: ; preds = %do.body56.2, %do.body56.1, %do.body.preheader - %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ] - %2 = insertelement <4 x i32> %1, i32 undef, i32 2 - %3 = insertelement <4 x i32> %2, i32 undef, i32 3 - br i1 undef, label %do.body124.1, label %do.body.1562.preheader - -do.body.1562.preheader: ; preds = %do.body124.1, %do.body90 - %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ] - %4 = insertelement <4 x i32> undef, i32 undef, i32 1 - br label %for.inc.1 - -do.body56.1: ; preds = %do.body.preheader - %5 = insertelement <4 x i32> %0, i32 undef, i32 1 - %or.cond472.1 = or i1 undef, undef - br i1 %or.cond472.1, label %do.body56.2, label %do.body90 - -do.body56.2: ; preds = %do.body56.1 - %6 = insertelement <4 x i32> %5, i32 undef, i32 1 - br label %do.body90 - -do.body124.1: ; preds = %do.body90 - %7 = insertelement <4 x i32> %3, i32 undef, i32 3 - br label %do.body.1562.preheader - -for.inc.1: ; preds = %do.body.1562.preheader, %entry - %storemerge591 = phi <4 x i32> [ zeroinitializer, %entry ], [ %storemerge, %do.body.1562.preheader ] - %add.i495 = add <4 x i32> %storemerge591, undef - unreachable -} - -; SI-LABEL: {{^}}foo: -; SI: s_endpgm -define void @foo() #0 { -bb: - br i1 undef, label %bb2, label %bb1 - -bb1: ; preds = %bb - br i1 undef, label %bb4, label %bb6 - -bb2: ; preds = %bb4, %bb - %tmp = phi float [ %tmp5, %bb4 ], [ 0.000000e+00, %bb ] - br i1 undef, label %bb9, label %bb13 - -bb4: ; preds = %bb7, %bb6, %bb1 - %tmp5 = phi float [ undef, %bb1 ], [ undef, %bb6 ], [ %tmp8, %bb7 ] - br label %bb2 - -bb6: ; preds 
= %bb1 - br i1 undef, label %bb7, label %bb4 - -bb7: ; preds = %bb6 - %tmp8 = fmul float undef, undef - br label %bb4 - -bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 2) - %tmp11 = extractelement <4 x float> %tmp10, i32 1 - %tmp12 = extractelement <4 x float> %tmp10, i32 3 - br label %bb14 - -bb13: ; preds = %bb2 - br i1 undef, label %bb23, label %bb24 - -bb14: ; preds = %bb27, %bb24, %bb9 - %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ] - %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ] - %tmp17 = fmul float 10.5, %tmp16 - %tmp18 = fmul float 11.5, %tmp15 - call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17) - ret void - -bb23: ; preds = %bb13 - br i1 undef, label %bb24, label %bb26 - -bb24: ; preds = %bb26, %bb23, %bb13 - %tmp25 = phi float [ %tmp, %bb13 ], [ %tmp, %bb26 ], [ 0.000000e+00, %bb23 ] - br i1 undef, label %bb27, label %bb14 - -bb26: ; preds = %bb23 - br label %bb24 - -bb27: ; preds = %bb24 - br label %bb14 -} - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/subreg-eliminate-dead.ll b/test/CodeGen/R600/subreg-eliminate-dead.ll deleted file mode 100644 index 8bd995a8ecb..00000000000 --- a/test/CodeGen/R600/subreg-eliminate-dead.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s -; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges -; properly. - -; Just make sure this test doesn't crash. 
-; CHECK-LABEL: foobar: -; CHECK: s_endpgm -define void @foobar() { - %v0 = icmp eq <4 x i32> undef, - %v3 = sext <4 x i1> %v0 to <4 x i32> - %v4 = extractelement <4 x i32> %v3, i32 1 - %v5 = icmp ne i32 %v4, 0 - %v6 = select i1 %v5, i32 undef, i32 0 - %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 - store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 - ret void -} - -declare double @llvm.fma.f64(double, double, double) diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll deleted file mode 100644 index 000ee2faa47..00000000000 --- a/test/CodeGen/R600/swizzle-export.ll +++ /dev/null @@ -1,129 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s - -;EG: {{^}}main: -;EG: EXPORT T{{[0-9]+}}.XYXX -;EG: EXPORT T{{[0-9]+}}.ZXXX -;EG: EXPORT T{{[0-9]+}}.XXWX -;EG: EXPORT T{{[0-9]+}}.XXXW - -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = extractelement <4 x float> %reg1, i32 2 - %3 = extractelement <4 x float> %reg1, i32 3 - %4 = load <4 x float>, <4 x float> addrspace(8)* null - %5 = extractelement <4 x float> %4, i32 1 - %6 = load <4 x float>, <4 x float> addrspace(8)* null - %7 = extractelement <4 x float> %6, i32 2 - %8 = load <4 x float>, <4 x float> addrspace(8)* null - %9 = extractelement <4 x float> %8, i32 0 - %10 = fmul float 0.000000e+00, %9 - %11 = load <4 x float>, <4 x float> addrspace(8)* null - %12 = extractelement <4 x float> %11, i32 0 - %13 = fmul float %5, %12 - %14 = load <4 x float>, <4 x float> addrspace(8)* null - %15 = extractelement <4 x float> %14, i32 0 - %16 = fmul float 0.000000e+00, %15 - %17 = load <4 x float>, <4 x float> addrspace(8)* null - %18 = extractelement <4 x float> %17, i32 0 - %19 = fmul float 0.000000e+00, %18 - %20 = load <4 x float>, <4 x float> addrspace(8)* null - %21 = extractelement <4 x float> %20, i32 0 - %22 = fmul float %7, %21 - %23 = load <4 x float>, <4 x float> addrspace(8)* null - %24 = extractelement <4 x float> %23, i32 0 - %25 = fmul float 0.000000e+00, %24 - %26 = load <4 x float>, <4 x float> addrspace(8)* null - %27 = extractelement <4 x float> %26, i32 0 - %28 = fmul float 0.000000e+00, %27 - %29 = load <4 x float>, <4 x float> addrspace(8)* null - %30 = extractelement <4 x float> %29, i32 0 - %31 = fmul float 0.000000e+00, %30 - %32 = load <4 x float>, <4 x float> addrspace(8)* null - %33 = extractelement <4 x float> %32, i32 0 - %34 = fmul float 0.000000e+00, %33 - %35 = load <4 x float>, <4 x float> addrspace(8)* null - %36 = extractelement <4 x float> %35, i32 0 - %37 = fmul float 0.000000e+00, %36 - %38 = load <4 x float>, <4 x float> addrspace(8)* null - %39 = extractelement <4 x float> %38, i32 0 - %40 = fmul float 1.000000e+00, %39 - %41 = load <4 x float>, <4 x float> addrspace(8)* null - %42 = extractelement <4 x float> %41, i32 0 - %43 = fmul float 0.000000e+00, %42 - %44 = load <4 x float>, <4 x float> addrspace(8)* null - %45 = extractelement <4 x float> %44, i32 0 - %46 = fmul float 0.000000e+00, %45 - %47 = load <4 x float>, <4 x float> addrspace(8)* null - %48 = extractelement <4 x float> %47, i32 0 - %49 = fmul float 0.000000e+00, %48 - %50 = load <4 x float>, <4 x float> addrspace(8)* null - %51 = extractelement <4 x float> %50, i32 0 - %52 = fmul float 0.000000e+00, %51 - %53 = load <4 x float>, <4 x float> addrspace(8)* null - %54 = extractelement <4 x float> %53, i32 0 - %55 = fmul float 
1.000000e+00, %54 - %56 = insertelement <4 x float> undef, float %0, i32 0 - %57 = insertelement <4 x float> %56, float %1, i32 1 - %58 = insertelement <4 x float> %57, float %2, i32 2 - %59 = insertelement <4 x float> %58, float %3, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %59, i32 60, i32 1) - %60 = insertelement <4 x float> undef, float %10, i32 0 - %61 = insertelement <4 x float> %60, float %13, i32 1 - %62 = insertelement <4 x float> %61, float %16, i32 2 - %63 = insertelement <4 x float> %62, float %19, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %63, i32 0, i32 2) - %64 = insertelement <4 x float> undef, float %22, i32 0 - %65 = insertelement <4 x float> %64, float %25, i32 1 - %66 = insertelement <4 x float> %65, float %28, i32 2 - %67 = insertelement <4 x float> %66, float %31, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %67, i32 1, i32 2) - %68 = insertelement <4 x float> undef, float %34, i32 0 - %69 = insertelement <4 x float> %68, float %37, i32 1 - %70 = insertelement <4 x float> %69, float %40, i32 2 - %71 = insertelement <4 x float> %70, float %43, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %71, i32 2, i32 2) - %72 = insertelement <4 x float> undef, float %46, i32 0 - %73 = insertelement <4 x float> %72, float %49, i32 1 - %74 = insertelement <4 x float> %73, float %52, i32 2 - %75 = insertelement <4 x float> %74, float %55, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %75, i32 3, i32 2) - ret void -} - -; EG: {{^}}main2: -; EG: T{{[0-9]+}}.XY__ -; EG: T{{[0-9]+}}.ZXY0 - -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { -main_body: - %0 = extractelement <4 x float> %reg1, i32 0 - %1 = extractelement <4 x float> %reg1, i32 1 - %2 = fadd float %0, 2.5 - %3 = fmul float %1, 3.5 - %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) - %5 = extractelement <4 x float> %4, i32 0 - %6 = call float @llvm.cos.f32(float %5) - %7 = load <4 x float>, <4 x float> addrspace(8)* null - %8 = extractelement <4 x float> %7, i32 0 - %9 = load <4 x float>, <4 x float> addrspace(8)* null - %10 = extractelement <4 x float> %9, i32 1 - %11 = insertelement <4 x float> undef, float %2, i32 0 - %12 = insertelement <4 x float> %11, float %3, i32 1 - call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1) - %13 = insertelement <4 x float> undef, float %6, i32 0 - %14 = insertelement <4 x float> %13, float %8, i32 1 - %15 = insertelement <4 x float> %14, float %10, i32 2 - %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3 - call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2) - ret void -} - -; Function Attrs: nounwind readonly -declare float @llvm.cos.f32(float) #1 - -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } diff --git a/test/CodeGen/R600/tex-clause-antidep.ll b/test/CodeGen/R600/tex-clause-antidep.ll deleted file mode 100644 index cbb9c50974a..00000000000 --- a/test/CodeGen/R600/tex-clause-antidep.ll +++ /dev/null @@ -1,25 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: TEX -;CHECK-NEXT: ALU - -define void @test(<4 x float> inreg %reg0) #0 { - %1 = extractelement <4 x float> %reg0, i32 0 - %2 = extractelement <4 x float> %reg0, i32 1 - %3 = extractelement <4 x float> %reg0, i32 2 - %4 = extractelement <4 x float> %reg0, i32 3 - %5 = insertelement <4 x float> undef, float 
%1, i32 0 - %6 = insertelement <4 x float> %5, float %2, i32 1 - %7 = insertelement <4 x float> %6, float %3, i32 2 - %8 = insertelement <4 x float> %7, float %4, i32 3 - %9 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %10 = call <4 x float> @llvm.R600.tex(<4 x float> %8, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %11 = fadd <4 x float> %9, %10 - call void @llvm.R600.store.swizzle(<4 x float> %11, i32 0, i32 0) - ret void -} - -declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/R600/texture-input-merge.ll b/test/CodeGen/R600/texture-input-merge.ll deleted file mode 100644 index 789538af582..00000000000 --- a/test/CodeGen/R600/texture-input-merge.ll +++ /dev/null @@ -1,31 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK-NOT: MOV - -define void @test(<4 x float> inreg %reg0) #0 { - %1 = extractelement <4 x float> %reg0, i32 0 - %2 = extractelement <4 x float> %reg0, i32 1 - %3 = extractelement <4 x float> %reg0, i32 2 - %4 = extractelement <4 x float> %reg0, i32 3 - %5 = fmul float %1, 3.0 - %6 = fmul float %2, 3.0 - %7 = fmul float %3, 3.0 - %8 = fmul float %4, 3.0 - %9 = insertelement <4 x float> undef, float %5, i32 0 - %10 = insertelement <4 x float> %9, float %6, i32 1 - %11 = insertelement <4 x float> undef, float %7, i32 0 - %12 = insertelement <4 x float> %11, float %5, i32 1 - %13 = insertelement <4 x float> undef, float %8, i32 0 - %14 = call <4 x float> @llvm.R600.tex(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %15 = call <4 x float> @llvm.R600.tex(<4 x float> %12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %16 = call <4 x float> @llvm.R600.tex(<4 x float> %13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %17 = fadd <4 x float> %14, %15 - %18 = fadd <4 x float> %17, %16 - call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 0) - ret void -} - -declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } \ No newline at end of file diff --git a/test/CodeGen/R600/trunc-cmp-constant.ll b/test/CodeGen/R600/trunc-cmp-constant.ll deleted file mode 100644 index dac74728b3c..00000000000 --- a/test/CodeGen/R600/trunc-cmp-constant.ll +++ /dev/null @@ -1,170 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_eq_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1{{$}} -; SI: v_cndmask_b32_e64 -; SI: buffer_store_byte -define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp eq i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FIXME: The negate should be inverting the compare. 
-; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp eq i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp eq i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp eq i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp eq i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp eq i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - - -; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, 0 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, 1 
- store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_1: -; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}} -; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] -; SI-NEXT: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, 1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FIXME: This should be one compare. -; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_neg1: -; XSI: buffer_load_ubyte [[LOAD:v[0-9]+]] -; XSI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] -; XSI: v_cmp_eq_i32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} -; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] -; XSI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = sext i1 %load to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: -; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} -; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { - %load = load i1, i1 addrspace(1)* %in - %ext = zext i1 %load to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}masked_load_i1_to_i32_trunc_cmp_ne_neg1: -; SI: buffer_load_sbyte [[LOAD:v[0-9]+]] -; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}} -; SI-NEXT: v_cndmask_b32_e64 -; SI-NEXT: buffer_store_byte -define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { - %load = load i8, i8 addrspace(1)* %in - %masked = and i8 %load, 255 - %ext = sext i8 %masked to i32 - %cmp = icmp ne i32 %ext, -1 - store i1 %cmp, i1 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/trunc-store-f64-to-f16.ll b/test/CodeGen/R600/trunc-store-f64-to-f16.ll deleted file mode 100644 index c29872beef8..00000000000 --- a/test/CodeGen/R600/trunc-store-f64-to-f16.ll +++ /dev/null @@ -1,56 +0,0 @@ -; XFAIL: * -; RUN: llc -march=amdgcn -mcpu=SI < %s - -; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: -; GCN: s_endpgm -define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { - %val = load double, double addrspace(1)* %in - %cvt = fptrunc double %val to half - store half %cvt, half addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: -; GCN: s_endpgm -define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { - %val = load <2 x double>, <2 x double> addrspace(1)* %in - %cvt = fptrunc <2 x double> %val to <2 x half> - store <2 x half> %cvt, <2 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: -; GCN: s_endpgm -define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { - %val = load <3 x double>, <3 x double> addrspace(1)* %in - %cvt = fptrunc <3 x double> %val to <3 x half> - store <3 x half> %cvt, <3 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: 
{{^}}global_truncstore_v4f64_to_v4f16: -; GCN: s_endpgm -define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { - %val = load <4 x double>, <4 x double> addrspace(1)* %in - %cvt = fptrunc <4 x double> %val to <4 x half> - store <4 x half> %cvt, <4 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: -; GCN: s_endpgm -define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { - %val = load <8 x double>, <8 x double> addrspace(1)* %in - %cvt = fptrunc <8 x double> %val to <8 x half> - store <8 x half> %cvt, <8 x half> addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: -; GCN: s_endpgm -define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { - %val = load <16 x double>, <16 x double> addrspace(1)* %in - %cvt = fptrunc <16 x double> %val to <16 x half> - store <16 x half> %cvt, <16 x half> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll deleted file mode 100644 index b71a838b62c..00000000000 --- a/test/CodeGen/R600/trunc-store-i1.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - - -; SI-LABEL: {{^}}global_truncstore_i32_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { - %trunc = trunc i32 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}global_truncstore_i64_to_i1: -; SI: buffer_store_byte -define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { - %trunc = trunc i64 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} - -; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -; SI: s_load_dword [[LOAD:s[0-9]+]], -; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 -; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] -; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { - %trunc = trunc i16 %val to i1 - store i1 %trunc, i1 addrspace(1)* %out, align 1 - ret void -} diff --git a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll deleted file mode 100644 index 878ea3f4899..00000000000 --- a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This tests for a bug in the SelectionDAG where custom lowered truncated -; vector stores at the end of a basic block were not being added to the -; LegalizedNodes list, which triggered an assertion failure. 
- -; CHECK-LABEL: {{^}}test: -; CHECK: MEM_RAT_CACHELESS STORE_RAW -define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %if, label %done - -if: - store <4 x i8> %in, <4 x i8> addrspace(1)* %out - br label %done - -done: - ret void -} diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll deleted file mode 100644 index bf690ca4cb2..00000000000 --- a/test/CodeGen/R600/trunc.ll +++ /dev/null @@ -1,100 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { -; SI-LABEL: {{^}}trunc_i64_to_i32_store: -; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb -; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] -; SI: buffer_store_dword [[VLOAD]] - -; EG-LABEL: {{^}}trunc_i64_to_i32_store: -; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG: LSHR -; EG-NEXT: 2( - - %result = trunc i64 %in to i32 store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}trunc_load_shl_i64: -; SI-DAG: s_load_dwordx2 -; SI-DAG: s_load_dword [[SREG:s[0-9]+]], -; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2 -; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]] -; SI: buffer_store_dword [[VSHL]], -define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { - %b = shl i64 %a, 2 - %result = trunc i64 %b to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}trunc_shl_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2 -; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], -; SI: s_addc_u32 -; SI: v_mov_b32_e32 -; SI: v_mov_b32_e32 -; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] -; SI: buffer_store_dword v[[LO_VREG]], -define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { - %aa = add i64 %a, 234 ; Prevent shrinking store. 
- %b = shl i64 %aa, 2 - %result = trunc i64 %b to i32 - store i32 %result, i32 addrspace(1)* %out, align 4 - store i64 %b, i64 addrspace(1)* %out2, align 8 ; Prevent reducing ops to 32-bits - ret void -} - -; SI-LABEL: {{^}}trunc_i32_to_i1: -; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; SI: v_cmp_eq_i32 -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { - %a = load i32, i32 addrspace(1)* %ptr, align 4 - %trunc = trunc i32 %a to i1 - %result = select i1 %trunc, i32 1, i32 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1: -; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}} -; SI: v_cmp_eq_i32 -define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { - %trunc = trunc i32 %a to i1 - %result = select i1 %trunc, i32 1, i32 0 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}s_trunc_i64_to_i1: -; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb -; SI: v_and_b32_e64 [[MASKED:v[0-9]+]], 1, s[[SLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { - %trunc = trunc i64 %x to i1 - %sel = select i1 %trunc, i32 63, i32 -12 - store i32 %sel, i32 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}v_trunc_i64_to_i1: -; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}} -; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] -; SI: v_cmp_eq_i32_e32 vcc, 1, [[MASKED]] -; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %x = load i64, i64 addrspace(1)* %gep - - %trunc = trunc i64 %x to i1 - %sel = select i1 %trunc, i32 63, i32 -12 - store i32 %sel, i32 addrspace(1)* %out.gep - ret void -} diff --git a/test/CodeGen/R600/tti-unroll-prefs.ll b/test/CodeGen/R600/tti-unroll-prefs.ll deleted file mode 100644 index 76c32afc1f2..00000000000 --- a/test/CodeGen/R600/tti-unroll-prefs.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s - -; This IR comes from this OpenCL C code: -; -; if (b + 4 > a) { -; for (int i = 0; i < 4; i++, b++) { -; if (b + 1 <= a) -; *(dst + c + b) = 0; -; else -; break; -; } -; } -; -; This test is meant to check that this loop isn't unrolled into more than -; four iterations. The loop unrolling preferences we currently use cause this -; loop to not be unrolled at all, but that may change in the future. 
- -; CHECK-LABEL: @test -; CHECK: store i8 0, i8 addrspace(1)* -; CHECK-NOT: store i8 0, i8 addrspace(1)* -; CHECK: ret void -define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { -entry: - %add = add nsw i32 %b, 4 - %cmp = icmp sgt i32 %add, %a - br i1 %cmp, label %for.cond.preheader, label %if.end7 - -for.cond.preheader: ; preds = %entry - %cmp313 = icmp slt i32 %b, %a - br i1 %cmp313, label %if.then4.lr.ph, label %if.end7.loopexit - -if.then4.lr.ph: ; preds = %for.cond.preheader - %0 = sext i32 %c to i64 - br label %if.then4 - -if.then4: ; preds = %if.then4.lr.ph, %if.then4 - %i.015 = phi i32 [ 0, %if.then4.lr.ph ], [ %inc, %if.then4 ] - %b.addr.014 = phi i32 [ %b, %if.then4.lr.ph ], [ %add2, %if.then4 ] - %add2 = add nsw i32 %b.addr.014, 1 - %1 = sext i32 %b.addr.014 to i64 - %add.ptr.sum = add nsw i64 %1, %0 - %add.ptr5 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %add.ptr.sum - store i8 0, i8 addrspace(1)* %add.ptr5, align 1 - %inc = add nsw i32 %i.015, 1 - %cmp1 = icmp slt i32 %inc, 4 - %cmp3 = icmp slt i32 %add2, %a - %or.cond = and i1 %cmp3, %cmp1 - br i1 %or.cond, label %if.then4, label %for.cond.if.end7.loopexit_crit_edge - -for.cond.if.end7.loopexit_crit_edge: ; preds = %if.then4 - br label %if.end7.loopexit - -if.end7.loopexit: ; preds = %for.cond.if.end7.loopexit_crit_edge, %for.cond.preheader - br label %if.end7 - -if.end7: ; preds = %if.end7.loopexit, %entry - ret void -} diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll deleted file mode 100644 index 11438f267ad..00000000000 --- a/test/CodeGen/R600/uaddo.ll +++ /dev/null @@ -1,85 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}uaddo_i64_zext: -; SI: add -; SI: addc -; SI: addc - -; EG: ADDC_UINT -; EG: ADDC_UINT -define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_uaddo_i32: -; SI: s_add_i32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %uadd, 0 - %carry = extractvalue { i32, i1 } %uadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_uaddo_i32: -; SI: v_add_i32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %uadd, 0 - %carry = 
extractvalue { i32, i1 } %uadd, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_uaddo_i64: -; SI: s_add_u32 -; SI: s_addc_u32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_uaddo_i64: -; SI: v_add_i32 -; SI: v_addc_u32 - -; EG: ADDC_UINT -; EG: ADD_INT -define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %uadd, 0 - %carry = extractvalue { i64, i1 } %uadd, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll deleted file mode 100644 index de22a22e502..00000000000 --- a/test/CodeGen/R600/udiv.ll +++ /dev/null @@ -1,48 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -;EG-LABEL: {{^}}test: -;EG-NOT: SETGE_INT -;EG: CF_END - -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1) * %in - %b = load i32, i32 addrspace(1) * %b_ptr - %result = udiv i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -;The code generated by udiv is long and complex and may frequently change. 
-;The goal of this test is to make sure the ISel doesn't fail when it gets -;a v4i32 udiv - -;EG-LABEL: {{^}}test2: -;EG: CF_END -;SI-LABEL: {{^}}test2: -;SI: s_endpgm - -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in - %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr - %result = udiv <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG-LABEL: {{^}}test4: -;EG: CF_END -;SI-LABEL: {{^}}test4: -;SI: s_endpgm - -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in - %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr - %result = udiv <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll deleted file mode 100644 index b3837f28209..00000000000 --- a/test/CodeGen/R600/udivrem.ll +++ /dev/null @@ -1,345 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_udivrem: -; EG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG: CNDE_INT -; EG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG: CNDE_INT -; EG: MULHI -; EG: MULLO_INT -; EG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] -; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] -; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] -; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]] -; SI-DAG: v_subrev_i32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] -; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]] -; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) { - %result0 = udiv i32 %x, %y - store i32 %result0, i32 addrspace(1)* %out - %result1 = urem i32 %x, %y - store i32 %result1, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_udivrem_v2: -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: 
SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { - %result0 = udiv <2 x i32> %x, %y - store <2 x i32> %result0, <2 x i32> addrspace(1)* %out - %result1 = urem <2 x i32> %x, %y - store <2 x i32> %result1, <2 x i32> addrspace(1)* %out - ret void -} - - -; FUNC-LABEL: {{^}}test_udivrem_v4: -; EG-DAG: RECIP_UINT -; EG-DAG: 
MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: RECIP_UINT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: MULHI -; EG-DAG: MULLO_INT -; EG-DAG: SUB_INT -; EG-DAG: SETGE_UINT -; EG-DAG: SETGE_UINT -; EG-DAG: AND_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT -; EG-DAG: ADD_INT -; EG-DAG: SUB_INT -; EG-DAG: CNDE_INT -; EG-DAG: CNDE_INT - -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[l0-9]+]], [[FIRST_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]] -; 
SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], [[SECOND_Num_S_Remainder]], v{{[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]] -; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]] -; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder:v[0-9]+]], [[THIRD_Num_S_Remainder]], {{v[0-9]+}} -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]], -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]] -; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]] -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]] -; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]] -; SI-DAG: v_cndmask_b32_e64 -; SI: s_endpgm -define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { - %result0 = udiv <4 x i32> %x, %y - store <4 x i32> %result0, <4 x i32> addrspace(1)* %out - %result1 = urem <4 x i32> %x, %y - store <4 x i32> %result1, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll deleted file mode 100644 index 4de881b66f1..00000000000 --- a/test/CodeGen/R600/udivrem24.ll +++ /dev/null @@ -1,245 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}udiv24_i8: -; SI: v_cvt_f32_ubyte -; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i8(i8 
addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = udiv i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}udiv24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = udiv i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}udiv24_i32: -; SI: v_cvt_f32_u32 -; SI-DAG: v_cvt_f32_u32 -; SI-DAG: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}udiv25_i32: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_udiv24_i32_1: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_udiv24_i32_2: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = udiv i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}urem24_i8: -; SI: v_cvt_f32_ubyte 
-; SI: v_cvt_f32_ubyte -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { - %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 - %num = load i8, i8 addrspace(1) * %in - %den = load i8, i8 addrspace(1) * %den_ptr - %result = urem i8 %num, %den - store i8 %result, i8 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}urem24_i16: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { - %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %num = load i16, i16 addrspace(1) * %in, align 2 - %den = load i16, i16 addrspace(1) * %den_ptr, align 2 - %result = urem i16 %num, %den - store i16 %result, i16 addrspace(1)* %out, align 2 - ret void -} - -; FUNC-LABEL: {{^}}urem24_i32: -; SI: v_cvt_f32_u32 -; SI: v_cvt_f32_u32 -; SI: v_rcp_f32 -; SI: v_cvt_u32_f32 - -; EG: UINT_TO_FLT -; EG-DAG: UINT_TO_FLT -; EG-DAG: RECIP_IEEE -; EG: FLT_TO_UINT -define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 8 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}urem25_i32: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_urem24_i32_1: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 8 - %den.i24.0 = shl i32 %den, 7 - %num.i24 = lshr i32 %num.i24.0, 8 - %den.i24 = lshr i32 %den.i24.0, 7 - %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}test_no_urem24_i32_2: -; RCP_IFLAG is for URECIP in the full 32b alg -; SI: v_rcp_iflag -; SI-NOT: v_rcp_f32 - -; EG-NOT: UINT_TO_FLT -; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %num = load i32, i32 addrspace(1) * %in, align 4 - %den = load i32, i32 addrspace(1) * %den_ptr, align 4 - %num.i24.0 = shl i32 %num, 7 - %den.i24.0 = shl i32 %den, 8 - %num.i24 = lshr i32 %num.i24.0, 7 - %den.i24 = lshr i32 %den.i24.0, 8 
- %result = urem i32 %num.i24, %den.i24 - store i32 %result, i32 addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll deleted file mode 100644 index 9f3069bdf80..00000000000 --- a/test/CodeGen/R600/udivrem64.ll +++ /dev/null @@ -1,223 +0,0 @@ -;RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s - -;FUNC-LABEL: {{^}}test_udiv: -;EG: RECIP_UINT -;EG: LSHL {{.*}}, 1, -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = udiv i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem: -;EG: RECIP_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: BFE_UINT -;EG: AND_INT {{.*}}, 1, - -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %result = urem i64 %x, %y - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_udiv3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 33 - %2 = lshr i64 %y, 33 - %result = udiv i64 %1, %2 - store i64 %result, 
i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem3264: -;EG: RECIP_UINT -;EG-NOT: BFE_UINT - -;GCN-NOT: s_bfe_u32 -;GCN-NOT: v_mad_f32 -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: s_endpgm -define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 33 - %2 = lshr i64 %y, 33 - %result = urem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_udiv2464: -;EG: UINT_TO_FLT -;EG: UINT_TO_FLT -;EG: FLT_TO_UINT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: v_mad_f32 -;GCN: s_endpgm -define void @test_udiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 40 - %2 = lshr i64 %y, 40 - %result = udiv i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -;FUNC-LABEL: {{^}}test_urem2464: -;EG: UINT_TO_FLT -;EG: UINT_TO_FLT -;EG: FLT_TO_UINT -;EG-NOT: RECIP_UINT -;EG-NOT: BFE_UINT - -;SI-NOT: v_lshr_b64 -;VI-NOT: v_lshrrev_b64 -;GCN: v_mad_f32 -;GCN: s_endpgm -define void @test_urem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { - %1 = lshr i64 %x, 40 - %2 = lshr i64 %y, 40 - %result = urem i64 %1, %2 - store i64 %result, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll deleted file mode 100644 index dfec8eb15cb..00000000000 --- a/test/CodeGen/R600/uint_to_fp.f64.ll +++ /dev/null @@ -1,98 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64 -; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]] -; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 -; SI: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]] -; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] -; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid - %val = load i64, i64 addrspace(1)* %gep, align 8 - %result = uitofp i64 %val to double - store double %result, double addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 -define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { - %cast = uitofp i64 %in to double - store double %cast, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 -define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { - %cast = uitofp <2 x i64> %in to <2 x double> - store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 -define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { - %cast = uitofp <4 x i64> %in to <4 x double> - store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { - %cast = uitofp i32 %in to double - store double %cast, double addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v2i32_to_v2f64 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void 
@s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { - %cast = uitofp <2 x i32> %in to <2 x double> - store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 - ret void -} - -; SI-LABEL: {{^}}s_uint_to_fp_v4i32_to_v4f64 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: v_cvt_f64_u32_e32 -; SI: s_endpgm -define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { - %cast = uitofp <4 x i32> %in to <4 x double> - store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 - ret void -} - -; FIXME: select on 0, 0 -; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] -; SI: buffer_store_dwordx2 -; SI: s_endpgm -define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to double - store double %fp, double addrspace(1)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}uint_to_fp_i1_to_f64_load: -; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]]], 0, 1 -; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] -; SI: buffer_store_dwordx2 [[RESULT]] -; SI: s_endpgm -define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { - %fp = uitofp i1 %in to double - store double %fp, double addrspace(1)* %out, align 8 - ret void -} diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll deleted file mode 100644 index 00fea80b1bc..00000000000 --- a/test/CodeGen/R600/uint_to_fp.ll +++ /dev/null @@ -1,82 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}uint_to_fp_i32_to_f32: -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z - -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) { - %result = uitofp i32 %in to float - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_v2i32_to_v2f32: -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W -; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) { - %result = uitofp <2 x i32> %in to <2 x float> - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_v4i32_to_v4f32: -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: s_endpgm -define void @uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %value = load <4 x i32>, <4 x i32> addrspace(1) * %in - %result = uitofp <4 x i32> %value to <4 x float> - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} 
- -; FUNC-LABEL: {{^}}uint_to_fp_i64_to_f32: -; R600: UINT_TO_FLT -; R600: UINT_TO_FLT -; R600: MULADD_IEEE -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_madmk_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x4f800000 -; SI: s_endpgm -define void @uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) { -entry: - %0 = uitofp i64 %in to float - store float %0, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) { - %cmp = icmp eq i32 %in, 0 - %fp = uitofp i1 %cmp to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}uint_to_fp_i1_to_f32_load: -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm -define void @uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) { - %fp = uitofp i1 %in to float - store float %fp, float addrspace(1)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll deleted file mode 100644 index 82d88ebd3ae..00000000000 --- a/test/CodeGen/R600/unaligned-load-store.ll +++ /dev/null @@ -1,254 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}unaligned_load_store_i16_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i16_local(i16 addrspace(3)* %p, i16 addrspace(3)* %r) nounwind { - %v = load i16, i16 addrspace(3)* %p, align 1 - store i16 %v, i16 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i16_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: s_endpgm -define void @unaligned_load_store_i16_global(i16 addrspace(1)* %p, i16 addrspace(1)* %r) nounwind { - %v = load i16, i16 addrspace(1)* %p, align 1 - store i16 %v, i16 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i32_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_i32_local(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind { - %v = load i32, i32 addrspace(3)* %p, align 1 - store i32 %v, i32 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i32_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @unaligned_load_store_i32_global(i32 addrspace(1)* %p, i32 addrspace(1)* %r) nounwind { - %v = load i32, i32 addrspace(1)* %p, align 1 - store i32 %v, i32 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i64_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: 
s_endpgm -define void @unaligned_load_store_i64_local(i64 addrspace(3)* %p, i64 addrspace(3)* %r) { - %v = load i64, i64 addrspace(3)* %p, align 1 - store i64 %v, i64 addrspace(3)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_i64_global: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -; SI: buffer_store_byte -define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) { - %v = load i64, i64 addrspace(1)* %p, align 1 - store i64 %v, i64 addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}unaligned_load_store_v4i32_local: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: s_endpgm -define void @unaligned_load_store_v4i32_local(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind { - %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 - store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 - ret void -} - -; FIXME: We mark v4i32 as custom, so misaligned loads are never expanded. 
-; FIXME-SI-LABEL: {{^}}unaligned_load_store_v4i32_global -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -; FIXME-SI: buffer_load_ubyte -define void @unaligned_load_store_v4i32_global(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) nounwind { - %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 - store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4: -; SI: ds_read2_b32 -; SI: s_endpgm -define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %val = load i64, i64 addrspace(3)* %in, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 -; SI: s_endpgm -define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 - %val = load i64, i64 addrspace(3)* %ptr, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset: -; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 -; SI: s_endpgm -define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* - %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 - %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* - %val = load i64, i64 addrspace(3)* %ptri64, align 4 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}load_lds_i64_align_1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: buffer_store_dwordx2 -; SI: s_endpgm - -define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { - %val = load i64, i64 addrspace(3)* %in, align 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4: -; SI: ds_write2_b32 -; SI: s_endpgm -define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { - store i64 %val, i64 addrspace(3)* %out, align 4 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset -; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 -; SI: s_endpgm -define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { - %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 - store i64 0, i64 addrspace(3)* %ptr, align 4 - ret void -} - -; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset: -; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits -; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; SI: s_endpgm -define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { - %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* - 
%ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 - %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* - store i64 0, i64 addrspace(3)* %out, align 4 - ret void -} diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll deleted file mode 100644 index 036a7e91b47..00000000000 --- a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll +++ /dev/null @@ -1,115 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s -; RUN: llc -O0 -verify-machineinstrs -asm-verbose=0 -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=COMMON %s - -; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. - -; COMMON-LABEL: {{^}}branch_true: -define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 true, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; COMMON-LABEL: {{^}}branch_false: -; SI: .text -; SI-NEXT: s_endpgm -define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 false, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 
%add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -; COMMON-LABEL: {{^}}branch_undef: -; SI: .text -; SI-NEXT: s_endpgm -define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { -entry: - br i1 undef, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %add.ptr.sum = shl i32 %main_stride, 1 - %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride - %add.ptr4.sum = shl i32 %main_stride, 2 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %main.addr.011 = phi i8 addrspace(1)* [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ] - %0 = bitcast i8 addrspace(1)* %main.addr.011 to i32 addrspace(1)* - %1 = load i32, i32 addrspace(1)* %0, align 4 - %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %main_stride - %2 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* - %3 = load i32, i32 addrspace(1)* %2, align 4 - %add.ptr1 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr.sum - %4 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* - %5 = load i32, i32 addrspace(1)* %4, align 4 - %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr1.sum - %6 = bitcast i8 addrspace(1)* %add.ptr2 to i32 addrspace(1)* - %7 = load i32, i32 addrspace(1)* %6, align 4 - %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 %add.ptr4.sum - %8 = bitcast i8 addrspace(1)* %add.ptr3 to i32 addrspace(1)* - %9 = load i32, i32 addrspace(1)* %8, align 4 - %add.ptr6 = getelementptr inbounds i8, i8 addrspace(1)* %main.addr.011, i32 undef - br i1 undef, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -attributes #0 = { nounwind } diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll deleted file mode 100644 index 411a15a4b83..00000000000 --- a/test/CodeGen/R600/unroll.ll +++ /dev/null @@ -1,36 +0,0 @@ -; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s -; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s - - -; This test contains a simple loop that initializes an array declared in -; private memory. We want to make sure these kinds of loops are always -; unrolled, because private memory is slow. 
- -; CHECK-LABEL: @test -; CHECK-NOT: alloca -; CHECK: store i32 5, i32 addrspace(1)* %out -define void @test(i32 addrspace(1)* %out) { -entry: - %0 = alloca [32 x i32] - br label %loop.header - -loop.header: - %counter = phi i32 [0, %entry], [%inc, %loop.inc] - br label %loop.body - -loop.body: - %ptr = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 %counter - store i32 %counter, i32* %ptr - br label %loop.inc - -loop.inc: - %inc = add i32 %counter, 1 - %1 = icmp sge i32 %counter, 32 - br i1 %1, label %exit, label %loop.header - -exit: - %2 = getelementptr [32 x i32], [32 x i32]* %0, i32 0, i32 5 - %3 = load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll deleted file mode 100644 index 8ab4faf2f14..00000000000 --- a/test/CodeGen/R600/unsupported-cc.ll +++ /dev/null @@ -1,125 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; These tests are for condition codes that are not supported by the hardware - -; CHECK-LABEL: {{^}}slt: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5(7.006492e-45) -define void @slt(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp slt i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 5(7.006492e-45) -define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp ult i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_float: -; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x -; CHECK-NEXT: 1084227584(5.000000e+00) -; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 -; CHECK-NEXT: LSHR * -define void @ult_float(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ult_float_native: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ult_float_native(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ult float %in, 5.0 - %1 = select i1 %0, float 0.0, float 1.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}olt: -; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @olt(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp olt float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}sle: -; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 6(8.407791e-45) -define void @sle(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp sle i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_i32: -; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR -; CHECK-NEXT: 6(8.407791e-45) -define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { -entry: - %0 = icmp ule i32 %in, 5 - %1 = select i1 %0, i32 -1, i32 0 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_float: -; CHECK: SETGT * 
T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x -; CHECK-NEXT: 1084227584(5.000000e+00) -; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 -; CHECK-NEXT: LSHR * -define void @ule_float(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ule_float_native: -; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x -; CHECK-NEXT: LSHR * -; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ule_float_native(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ule float %in, 5.0 - %1 = select i1 %0, float 0.0, float 1.0 - store float %1, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: {{^}}ole: -; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z -; CHECK-NEXT: LSHR * -; CHECK-NEXT:1084227584(5.000000e+00) -define void @ole(float addrspace(1)* %out, float %in) { -entry: - %0 = fcmp ole float %in, 5.0 - %1 = select i1 %0, float 1.0, float 0.0 - store float %1, float addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll deleted file mode 100644 index daacc771708..00000000000 --- a/test/CodeGen/R600/urecip.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK: v_rcp_iflag_f32_e32 - -define void @test(i32 %p, i32 %q) { - %i = udiv i32 %p, %q - %r = bitcast i32 %i to float - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) - ret void -} - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll deleted file mode 100644 index 62841ec2d6c..00000000000 --- a/test/CodeGen/R600/urem.ll +++ /dev/null @@ -1,94 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; The code generated by urem is long and complex and may frequently -; change. 
The goal of this test is to make sure the ISel doesn't fail -; when it gets a v2i32/v4i32 urem - -; FUNC-LABEL: {{^}}test_urem_i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 - %a = load i32, i32 addrspace(1)* %in - %b = load i32, i32 addrspace(1)* %b_ptr - %result = urem i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_i32_7: -; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 -; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] -; SI: v_subrev_i32 -; SI: v_mul_lo_i32 -; SI: v_sub_i32 -; SI: buffer_store_dword -; SI: s_endpgm -define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { - %num = load i32, i32 addrspace(1) * %in - %result = urem i32 %num, 7 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v2i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 - %a = load <2 x i32>, <2 x i32> addrspace(1)* %in - %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr - %result = urem <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v4i32: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 - %a = load <4 x i32>, <4 x i32> addrspace(1)* %in - %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr - %result = urem <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { - %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 - %a = load i64, i64 addrspace(1)* %in - %b = load i64, i64 addrspace(1)* %b_ptr - %result = urem i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v2i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 - %a = load <2 x i64>, <2 x i64> addrspace(1)* %in - %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr - %result = urem <2 x i64> %a, %b - store <2 x i64> %result, <2 x i64> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_urem_v4i64: -; SI: s_endpgm -; EG: CF_END -define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { - %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 - %a = load <4 x i64>, <4 x i64> addrspace(1)* %in - %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr - %result = urem <4 x i64> %a, %b - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll deleted file mode 100644 index f26f30022b4..00000000000 --- a/test/CodeGen/R600/use-sgpr-multiple-times.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s - -declare float @llvm.fma.f32(float, float, 
float) #1 -declare float @llvm.fmuladd.f32(float, float, float) #1 -declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1 - - -; GCN-LABEL: {{^}}test_sgpr_use_twice_binop: -; GCN: s_load_dword [[SGPR:s[0-9]+]], -; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { - %dbl = fadd float %a, %a - store float %dbl, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_three_ternary_op: -; GCN: s_load_dword [[SGPR:s[0-9]+]], -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: -; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc -; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { - %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; GCN-LABEL: 
{{^}}test_sgpr_use_twice_ternary_op_a_imm_a: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { - %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 - store float %fma, float addrspace(1)* %out, align 4 - ret void -} - -; Don't use fma since fma c, x, y is canonicalized to fma x, c, y -; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a: -; GCN: s_load_dword [[SGPR:s[0-9]+]] -; GCN: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]] -; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 { - %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1 - store i32 %fma, i32 addrspace(1)* %out, align 4 - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll deleted file mode 100644 index 3c9b1622a07..00000000000 --- a/test/CodeGen/R600/usubo.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone -declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone - -; FUNC-LABEL: {{^}}usubo_i64_zext: - -; EG: SUBB_UINT -; EG: ADDC_UINT -define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - %ext = zext i1 %carry to i64 - %add2 = add i64 %val, %ext - store i64 %add2, i64 addrspace(1)* %out, align 8 - ret void -} - -; FUNC-LABEL: {{^}}s_usubo_i32: -; SI: s_sub_i32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %usub, 0 - %carry = extractvalue { i32, i1 } %usub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_usubo_i32: -; SI: v_subrev_i32_e32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 - %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind - %val = extractvalue { i32, i1 } %usub, 0 - %carry = extractvalue { i32, i1 } %usub, 1 - store i32 %val, i32 addrspace(1)* %out, align 4 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}s_usubo_i64: -; SI: s_sub_u32 -; SI: s_subb_u32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG: SUB_INT -define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = 
extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} - -; FUNC-LABEL: {{^}}v_usubo_i64: -; SI: v_sub_i32 -; SI: v_subb_u32 - -; EG-DAG: SUBB_UINT -; EG-DAG: SUB_INT -; EG-DAG: SUB_INT -; EG: SUB_INT -define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { - %a = load i64, i64 addrspace(1)* %aptr, align 4 - %b = load i64, i64 addrspace(1)* %bptr, align 4 - %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind - %val = extractvalue { i64, i1 } %usub, 0 - %carry = extractvalue { i64, i1 } %usub, 1 - store i64 %val, i64 addrspace(1)* %out, align 8 - store i1 %carry, i1 addrspace(1)* %carryout - ret void -} diff --git a/test/CodeGen/R600/v1i64-kernel-arg.ll b/test/CodeGen/R600/v1i64-kernel-arg.ll deleted file mode 100644 index 31755125c03..00000000000 --- a/test/CodeGen/R600/v1i64-kernel-arg.ll +++ /dev/null @@ -1,17 +0,0 @@ -; REQUIRES: asserts -; XFAIL: * -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s - -; CHECK-LABEL: {{^}}kernel_arg_i64: -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { - store i64 %a, i64 addrspace(1)* %out, align 8 - ret void -} - -; i64 arg works, v1i64 arg does not. -; CHECK-LABEL: {{^}}kernel_arg_v1i64: -define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { - store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 - ret void -} - diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll deleted file mode 100644 index c368c5aaf7d..00000000000 --- a/test/CodeGen/R600/v_cndmask.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() #1 - -; SI-LABEL: {{^}}v_cnd_nan_nosgpr: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} -; All nan values are converted to 0xffffffff -; SI: s_endpgm -define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { - %idx = call i32 @llvm.r600.read.tidig.x() #1 - %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx - %f = load float, float addrspace(1)* %fptr - %setcc = icmp ne i32 %c, 0 - %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f - store float %select, float addrspace(1)* %out - ret void -} - - -; This requires slightly trickier SGPR operand legalization since the -; single constant bus SGPR usage is the last operand, and it should -; never be moved. 
- -; SI-LABEL: {{^}}v_cnd_nan: -; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}} -; SI-DAG: v{{[0-9]}} -; All nan values are converted to 0xffffffff -; SI: s_endpgm -define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { - %setcc = icmp ne i32 %c, 0 - %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f - store float %select, float addrspace(1)* %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll deleted file mode 100644 index 7d0ebd139f5..00000000000 --- a/test/CodeGen/R600/valu-i1.ll +++ /dev/null @@ -1,188 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s - -declare i32 @llvm.r600.read.tidig.x() nounwind readnone - -; SI-LABEL: @test_if -; Make sure the i1 values created by the cfg structurizer pass are -; moved using VALU instructions -; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1 -; SI: v_mov_b32_e32 v{{[0-9]}}, -1 -define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { -entry: - switch i32 %a, label %default [ - i32 0, label %case0 - i32 1, label %case1 - ] - -case0: - %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - store i32 0, i32 addrspace(1)* %arrayidx1, align 4 - br label %end - -case1: - %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - store i32 1, i32 addrspace(1)* %arrayidx5, align 4 - br label %end - -default: - %cmp8 = icmp eq i32 %a, 2 - %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b - br i1 %cmp8, label %if, label %else - -if: - store i32 2, i32 addrspace(1)* %arrayidx10, align 4 - br label %end - -else: - store i32 3, i32 addrspace(1)* %arrayidx10, align 4 - br label %end - -end: - ret void -} - -; SI-LABEL: @simple_test_v_if -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] - -; SI: ; BB#1 -; SI: buffer_store_dword -; SI: s_endpgm - -; SI: BB1_2: -; SI: s_or_b64 exec, exec, [[BR_SREG]] -; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit - -store: - %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid - store i32 999, i32 addrspace(1)* %gep - ret void - -exit: - ret void -} - -; SI-LABEL: @simple_test_v_loop -; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}} -; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] -; SI: s_cbranch_execz BB2_2 - -; SI: ; BB#1: -; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} - -; SI: BB2_3: -; SI: buffer_load_dword -; SI: buffer_store_dword -; SI: v_cmp_eq_i32_e32 vcc, -; SI: s_or_b64 [[OR_SREG:s\[[0-9]+:[0-9]+\]]] -; SI: s_andn2_b64 exec, exec, [[OR_SREG]] -; SI: s_cbranch_execnz BB2_3 - -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { -entry: - %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone - %is.0 = icmp ne i32 %tid, 0 - %limit = add i32 %tid, 64 - br i1 %is.0, label %loop, label %exit - -loop: - %i = phi i32 [%tid, %entry], [%i.inc, %loop] - %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i - %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i - %load = load i32, i32 addrspace(1)* %src - store i32 %load, i32 
addrspace(1)* %gep.dst - %i.inc = add nsw i32 %i, 1 - %cmp = icmp eq i32 %limit, %i.inc - br i1 %cmp, label %exit, label %loop - -exit: - ret void -} - -; SI-LABEL: @multi_vcond_loop - -; Load loop limit from buffer -; Branch to exit if uniformly not taken -; SI: ; BB#0: -; SI: buffer_load_dword [[VBOUND:v[0-9]+]] -; SI: v_cmp_lt_i32_e32 vcc -; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc -; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] -; SI: s_cbranch_execz BB3_2 - -; Initialize inner condition to false -; SI: ; BB#1: -; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} -; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] - -; Clear exec bits for workitems that load -1s -; SI: BB3_3: -; SI: buffer_load_dword [[B:v[0-9]+]] -; SI: buffer_load_dword [[A:v[0-9]+]] -; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]] -; SI-DAG: v_cmp_ne_i32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] -; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] -; SI: s_and_saveexec_b64 [[ORNEG1]], [[ORNEG1]] -; SI: s_xor_b64 [[ORNEG1]], exec, [[ORNEG1]] -; SI: s_cbranch_execz BB3_5 - -; SI: BB#4: -; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 vcc -; SI: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] - -; SI: BB3_5: -; SI: s_or_b64 exec, exec, [[ORNEG1]] -; SI: s_or_b64 [[COND_STATE]], [[ORNEG1]], [[COND_STATE]] -; SI: s_andn2_b64 exec, exec, [[COND_STATE]] -; SI: s_cbranch_execnz BB3_3 - -; SI: BB#6 -; SI: s_or_b64 exec, exec, [[COND_STATE]] - -; SI: BB3_2: -; SI-NOT: [[COND_STATE]] -; SI: s_endpgm - -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { -bb: - %tmp = tail call i32 @llvm.r600.read.tidig.x() #0 - %tmp4 = sext i32 %tmp to i64 - %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4 - %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 - %tmp7 = icmp sgt i32 %tmp6, 0 - %tmp8 = sext i32 %tmp6 to i64 - br i1 %tmp7, label %bb10, label %bb26 - -bb10: ; preds = %bb, %bb20 - %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ] - %tmp12 = add nsw i64 %tmp11, %tmp4 - %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12 - %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4 - %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12 - %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 - %tmp17 = icmp ne i32 %tmp14, -1 - %tmp18 = icmp ne i32 %tmp16, -1 - %tmp19 = and i1 %tmp17, %tmp18 - br i1 %tmp19, label %bb20, label %bb26 - -bb20: ; preds = %bb10 - %tmp21 = add nsw i32 %tmp16, %tmp14 - %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12 - store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4 - %tmp23 = add nuw nsw i64 %tmp11, 1 - %tmp24 = icmp slt i64 %tmp23, %tmp8 - br i1 %tmp24, label %bb10, label %bb26 - -bb26: ; preds = %bb10, %bb20, %bb - ret void -} - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll deleted file mode 100644 index 6f3b4847fbd..00000000000 --- a/test/CodeGen/R600/vector-alloca.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s 
-; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}vector_read: -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOVA_INT -define void @vector_read(i32 addrspace(1)* %out, i32 %index) { -entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 - store i32 0, i32* %x - store i32 1, i32* %y - store i32 2, i32* %z - store i32 3, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %index - %2 = load i32, i32* %1 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_write: -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOV -; EG: MOVA_INT -; EG: MOVA_INT -define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { -entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 - store i32 0, i32* %x - store i32 0, i32* %y - store i32 0, i32* %z - store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %w_index - store i32 1, i32* %1 - %2 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 %r_index - %3 = load i32, i32* %2 - store i32 %3, i32 addrspace(1)* %out - ret void -} - -; This test should be optimize to: -; store i32 0, i32 addrspace(1)* %out -; FUNC-LABEL: {{^}}bitcast_gep: -; EG: STORE_RAW -define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { -entry: - %0 = alloca [4 x i32] - %x = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 3 - store i32 0, i32* %x - store i32 0, i32* %y - store i32 0, i32* %z - store i32 0, i32* %w - %1 = getelementptr [4 x i32], [4 x i32]* %0, i32 0, i32 1 - %2 = bitcast i32* %1 to [4 x i32]* - %3 = getelementptr [4 x i32], [4 x i32]* %2, i32 0, i32 0 - %4 = load i32, i32* %3 - store i32 %4, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll deleted file mode 100644 index fb6a17e6714..00000000000 --- a/test/CodeGen/R600/vertex-fetch-encoding.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI %s -; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM %s - -; NI: {{^}}vtx_fetch32: -; NI: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 -; CM: {{^}}vtx_fetch32: -; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 - -define void @vtx_fetch32(i32 addrspace(1)* %out, 
i32 addrspace(1)* %in) { -entry: - %0 = load i32, i32 addrspace(1)* %in - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; NI: {{^}}vtx_fetch128: -; NI: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 -; XXX: Add a case for Cayman when v4i32 stores are supported. - -define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { -entry: - %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in - store <4 x i32> %0, <4 x i32> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/vop-shrink.ll b/test/CodeGen/R600/vop-shrink.ll deleted file mode 100644 index 9b2f229c05a..00000000000 --- a/test/CodeGen/R600/vop-shrink.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s - -; Test that we correctly commute a sub instruction -; FUNC-LABEL: {{^}}sub_rev: -; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s -; SI: v_subrev_i32_e32 v{{[0-9]+}}, s - -; ModuleID = 'vop-shrink.ll' - -define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { -entry: - %vgpr = call i32 @llvm.r600.read.tidig.x() #1 - %tmp = icmp eq i32 %cond, 0 - br i1 %tmp, label %if, label %else - -if: ; preds = %entry - %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 - %tmp2 = extractelement <4 x i32> %sgpr, i32 1 - store i32 %tmp2, i32 addrspace(1)* %out - br label %endif - -else: ; preds = %entry - %tmp3 = extractelement <4 x i32> %sgpr, i32 2 - %tmp4 = sub i32 %vgpr, %tmp3 - store i32 %tmp4, i32 addrspace(1)* %out - br label %endif - -endif: ; preds = %else, %if - ret void -} - -; Test that we fold an immediate that was illegal for a 64-bit op into the -; 32-bit op when we shrink it. 
- -; FUNC-LABEL: {{^}}add_fold: -; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 -define void @add_fold(float addrspace(1)* %out) { -entry: - %tmp = call i32 @llvm.r600.read.tidig.x() - %tmp1 = uitofp i32 %tmp to float - %tmp2 = fadd float %tmp1, 1.024000e+03 - store float %tmp2, float addrspace(1)* %out - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { readnone } diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll deleted file mode 100644 index a3014b03d2b..00000000000 --- a/test/CodeGen/R600/vselect.ll +++ /dev/null @@ -1,77 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s - -;EG: {{^}}test_select_v2i32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v2i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { -entry: - %0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 - %1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 - %cmp = icmp ne <2 x i32> %0, %1 - %result = select <2 x i1> %cmp, <2 x i32> %0, <2 x i32> %1 - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v2f32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v2f32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { -entry: - %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 - %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 - %cmp = fcmp une <2 x float> %0, %1 - %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1 - store <2 x float> %result, <2 x float> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v4i32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -;SI: {{^}}test_select_v4i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 - -define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { -entry: - %0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 - %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 - %cmp = icmp ne <4 x i32> %0, %1 - %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -;EG: {{^}}test_select_v4f32: -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], 
T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { -entry: - %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 - %1 = load <4 x float>, <4 x float> addrspace(1)* %in1 - %cmp = fcmp une <4 x float> %0, %1 - %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1 - store <4 x float> %result, <4 x float> addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/vselect64.ll b/test/CodeGen/R600/vselect64.ll deleted file mode 100644 index ef85ebe7899..00000000000 --- a/test/CodeGen/R600/vselect64.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; XXX: Merge this test into vselect.ll once SI supports 64-bit select. - -; CHECK-LABEL: {{^}}test_select_v4i64: -; Make sure the vectors aren't being stored on the stack. We know they are -; being stored on the stack if the shader uses at least 10 registers. -; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X -define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { -entry: - %cmp = icmp ne <4 x i32> %c, - %result = select <4 x i1> %cmp, <4 x i64> , <4 x i64> - store <4 x i64> %result, <4 x i64> addrspace(1)* %out - ret void -} - diff --git a/test/CodeGen/R600/vtx-fetch-branch.ll b/test/CodeGen/R600/vtx-fetch-branch.ll deleted file mode 100644 index 4584d6e2525..00000000000 --- a/test/CodeGen/R600/vtx-fetch-branch.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood %s -o - | FileCheck %s - -; This tests for a bug where vertex fetch clauses right before an ENDIF -; instruction were being emitted after the ENDIF. We were using ALU_POP_AFTER -; for the ALU clause before the vertex fetch instead of emitting a POP instruction -; after the fetch clause. - - -; CHECK-LABEL: {{^}}test: -; CHECK-NOT: ALU_POP_AFTER -; CHECK: TEX -; CHECK-NEXT: POP -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { -entry: - %0 = icmp eq i32 %cond, 0 - br i1 %0, label %endif, label %if - -if: - %1 = load i32, i32 addrspace(1)* %in - br label %endif - -endif: - %x = phi i32 [ %1, %if], [ 0, %entry] - store i32 %x, i32 addrspace(1)* %out - br label %done - -done: - ret void -} diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll deleted file mode 100644 index 912e258ebb8..00000000000 --- a/test/CodeGen/R600/vtx-schedule.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -; This test is for a scheduler bug where VTX_READ instructions that used -; the result of another VTX_READ instruction were being grouped in the -; same fetch clause.
- -; CHECK: {{^}}test: -; CHECK: Fetch clause -; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 -; CHECK: Fetch clause -; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 -define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { -entry: - %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 - %1 = load i32, i32 addrspace(1)* %0 - store i32 %1, i32 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll deleted file mode 100644 index 5cc7577cad3..00000000000 --- a/test/CodeGen/R600/wait.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s - -; CHECK-LABEL: {{^}}main: -; CHECK: s_load_dwordx4 -; CHECK: s_load_dwordx4 -; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; CHECK: s_endpgm -define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { -main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6) - %tmp12 = extractelement <4 x float> %tmp11, i32 0 - %tmp13 = extractelement <4 x float> %tmp11, i32 1 - call void @llvm.AMDGPU.barrier.global() #1 - %tmp14 = extractelement <4 x float> %tmp11, i32 2 -; %tmp15 = extractelement <4 x float> %tmp11, i32 3 - %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt - %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1 - %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0 - %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6) - %tmp19 = extractelement <4 x float> %tmp18, i32 0 - %tmp20 = extractelement <4 x float> %tmp18, i32 1 - %tmp21 = extractelement <4 x float> %tmp18, i32 2 - %tmp22 = extractelement <4 x float> %tmp18, i32 3 - call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22) - call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15) - ret void -} - -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.global() #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2 - -declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } -attributes #1 = { noduplicate nounwind } -attributes #2 = { nounwind readnone } - -!0 = !{!1, !1, i64 0, i32 1} -!1 = !{!"const", null} diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll deleted file mode 100644 index 4328e964c1b..00000000000 --- a/test/CodeGen/R600/work-item-intrinsics.ll +++ /dev/null @@ -1,238 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood 
< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}ngroups_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].X - -; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ngroups_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Y - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}ngroups_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @ngroups_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.ngroups.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[0].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}global_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Y - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @global_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.global.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: 
buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.AMDGPU.read.workdim() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; The tgid values are stored in sgprs offset by the number of user sgprs. -; Currently we always use exactly 2 user sgprs for the pointer to the -; kernel arguments, but this may change in the future. - -; FUNC-LABEL: {{^}}tgid_x: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tgid_y: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tgid_z: -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6 -; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tgid.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_x: -; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_y: -; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}tidig_z: -; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.tidig.z() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -declare i32 @llvm.r600.read.ngroups.x() #0 -declare i32 @llvm.r600.read.ngroups.y() #0 -declare i32 @llvm.r600.read.ngroups.z() #0 - -declare i32 @llvm.r600.read.global.size.x() #0 -declare i32 @llvm.r600.read.global.size.y() #0 -declare i32 @llvm.r600.read.global.size.z() #0 - -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - -declare i32 @llvm.r600.read.tgid.x() #0 -declare i32 @llvm.r600.read.tgid.y() #0 -declare i32 @llvm.r600.read.tgid.z() #0 - -declare i32 @llvm.r600.read.tidig.x() #0 -declare i32 @llvm.r600.read.tidig.y() #0 -declare i32 @llvm.r600.read.tidig.z() #0 - -declare i32 @llvm.AMDGPU.read.workdim() #0 - -attributes #0 = { readnone } diff --git 
a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll deleted file mode 100644 index 8b383e4c393..00000000000 --- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s - -; We want all MULLO_INT inst to be last in their instruction group -;CHECK: {{^}}fill3d: -;CHECK-NOT: MULLO_INT T[0-9]+ - -define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { -entry: - %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 - %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 - %mul = mul i32 %y.i18, %x.i - %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1 - %mul3 = mul i32 %mul, %z.i17 - %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1 - %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1 - %mul26.i = mul i32 %x.i12.i, %x.i.i - %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1 - %add.i16 = add i32 %x.i4.i, %mul26.i - %mul7 = mul i32 %add.i16, %y.i18 - %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1 - %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1 - %mul30.i = mul i32 %y.i14.i, %y.i.i - %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1 - %add.i14 = add i32 %mul30.i, %mul7 - %mul819 = add i32 %add.i14, %y.i6.i - %add = mul i32 %mul819, %z.i17 - %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1 - %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1 - %mul33.i = mul i32 %z.i16.i, %z.i.i - %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1 - %add.i = add i32 %z.i8.i, %mul33.i - %add13 = add i32 %add.i, %add - %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add13 - store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4 - ret void -} - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tgid.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.local.size.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.tidig.z() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.x() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.y() #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.r600.read.global.size.z() #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } - -!opencl.kernels = !{!0, !1, !2} - -!0 = !{null} -!1 = !{null} -!2 = !{void (i32 addrspace(1)*)* @fill3d} diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll deleted file mode 100644 index 089db59eabc..00000000000 --- a/test/CodeGen/R600/xor.ll +++ /dev/null @@ -1,173 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck 
-check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - - -; FUNC-LABEL: {{^}}xor_v2i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} - -define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { - %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 - %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 - %result = xor <2 x i32> %a, %b - store <2 x i32> %result, <2 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}xor_v4i32: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { - %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 - %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 - %result = xor <4 x i32> %a, %b - store <4 x i32> %result, <4 x i32> addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}xor_i1: -; EG: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}} - -; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} -; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} -; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm -define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { - %a = load float, float addrspace(1) * %in0 - %b = load float, float addrspace(1) * %in1 - %acmp = fcmp oge float %a, 0.000000e+00 - %bcmp = fcmp oge float %b, 1.000000e+00 - %xor = xor i1 %acmp, %bcmp - %result = select i1 %xor, float %a, float %b - store float %result, float addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}v_xor_i1: -; SI: buffer_load_ubyte [[B:v[0-9]+]] -; SI: buffer_load_ubyte [[A:v[0-9]+]] -; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] -; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] -; SI: buffer_store_byte [[RESULT]] -define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { - %a = load i1, i1 addrspace(1)* %in0 - %b = load i1, i1 addrspace(1)* %in1 - %xor = xor i1 %a, %b - store i1 %xor, i1 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_xor_i32: -; SI: v_xor_b32_e32 -define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { - %a = load i32, i32 addrspace(1)* %in0 - %b = load i32, i32 addrspace(1)* %in1 - %result = xor i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_xor_i32: -; SI: s_xor_b32 -define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { - %result = xor i32 %a, %b - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_not_i32: -; SI: s_not_b32 
-define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { - %result = xor i32 %a, -1 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_not_i32: -; SI: v_not_b32 -define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { - %a = load i32, i32 addrspace(1)* %in0 - %b = load i32, i32 addrspace(1)* %in1 - %result = xor i32 %a, -1 - store i32 %result, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_xor_i64: -; SI: v_xor_b32_e32 -; SI: v_xor_b32_e32 -; SI: s_endpgm -define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { - %a = load i64, i64 addrspace(1)* %in0 - %b = load i64, i64 addrspace(1)* %in1 - %result = xor i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_xor_i64: -; SI: s_xor_b64 -; SI: s_endpgm -define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { - %result = xor i64 %a, %b - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}scalar_not_i64: -; SI: s_not_b64 -define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { - %result = xor i64 %a, -1 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}vector_not_i64: -; SI: v_not_b32 -; SI: v_not_b32 -define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { - %a = load i64, i64 addrspace(1)* %in0 - %b = load i64, i64 addrspace(1)* %in1 - %result = xor i64 %a, -1 - store i64 %result, i64 addrspace(1)* %out - ret void -} - -; Test that we have a pattern to match xor inside a branch. -; Note that in the future the backend may be smart enough to -; use an SALU instruction for this. - -; FUNC-LABEL: {{^}}xor_cf: -; SI: s_xor_b64 -define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { -entry: - %0 = icmp eq i64 %a, 0 - br i1 %0, label %if, label %else - -if: - %1 = xor i64 %a, %b - br label %endif - -else: - %2 = load i64, i64 addrspace(1)* %in - br label %endif - -endif: - %3 = phi i64 [%1, %if], [%2, %else] - store i64 %3, i64 addrspace(1)* %out - ret void -} diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll deleted file mode 100644 index 033055db185..00000000000 --- a/test/CodeGen/R600/zero_extend.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI - -; R600: {{^}}test: -; R600: MEM_RAT_CACHELESS STORE_RAW -; R600: MEM_RAT_CACHELESS STORE_RAW - -; SI: {{^}}test: -; SI: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}} -; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]] -; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { -entry: - %0 = mul i32 %a, %b - %1 = add i32 %0, %c - %2 = zext i32 %1 to i64 - store i64 %2, i64 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}testi1toi32: -; SI: v_cndmask_b32 -define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = icmp eq i32 %a, %b - %1 = zext i1 %0 to i32 - store i32 %1, i32 addrspace(1)* %out - ret void -} - -; SI-LABEL: {{^}}zext_i1_to_i64: -; SI: s_mov_b32 s{{[0-9]+}}, 0 -; SI: v_cmp_eq_i32 -; SI: v_cndmask_b32 -define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { - %cmp = icmp eq 
i32 %a, %b - %ext = zext i1 %cmp to i64 - store i64 %ext, i64 addrspace(1)* %out, align 8 - ret void -} diff --git a/test/MC/AMDGPU/ds-err.s b/test/MC/AMDGPU/ds-err.s new file mode 100644 index 00000000000..52c2740bec2 --- /dev/null +++ b/test/MC/AMDGPU/ds-err.s @@ -0,0 +1,23 @@ +// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s + +// offset too big +// CHECK: invalid operand for instruction +ds_add_u32 v2, v4 offset:1000000000 + +// offset0 twice +// CHECK: error: not a valid operand. +ds_write2_b32 v2, v4, v6 offset0:4 offset0:8 + +// offset1 twice +// CHECK: error: not a valid operand. +ds_write2_b32 v2, v4, v6 offset1:4 offset1:8 + +// offset0 too big +// CHECK: invalid operand for instruction +ds_write2_b32 v2, v4, v6 offset0:1000000000 + +// offset1 too big +// CHECK: invalid operand for instruction +ds_write2_b32 v2, v4, v6 offset1:1000000000 + diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s new file mode 100644 index 00000000000..ad63229ba2e --- /dev/null +++ b/test/MC/AMDGPU/ds.s @@ -0,0 +1,337 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Checks for 16-bit Offsets +//===----------------------------------------------------------------------===// + +ds_add_u32 v2, v4 offset:16 +// CHECK: ds_add_u32 v2, v4 offset:16 ; encoding: [0x10,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] + +//===----------------------------------------------------------------------===// +// Checks for 2 8-bit Offsets +//===----------------------------------------------------------------------===// + +ds_write2_b32 v2, v4, v6 offset0:4 +// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 ; encoding: [0x04,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 +// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 ; encoding: [0x04,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_write2_b32 v2, v4, v6 offset1:8 +// CHECK: ds_write2_b32 v2, v4, v6 offset1:8 ; encoding: [0x00,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_read2_b32 v[8:9], v2 offset0:4 +// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 ; encoding: [0x04,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] + +ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 +// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 ; encoding: [0x04,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] + +ds_read2_b32 v[8:9], v2 offset1:8 +// CHECK: ds_read2_b32 v[8:9], v2 offset1:8 ; encoding: [0x00,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +ds_add_u32 v2, v4 +// CHECK: ds_add_u32 v2, v4 ; encoding: [0x00,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] + +ds_sub_u32 v2, v4 +// CHECK: ds_sub_u32 v2, v4 ; encoding: [0x00,0x00,0x04,0xd8,0x02,0x04,0x00,0x00] + +ds_rsub_u32 v2, v4 +// CHECK: ds_rsub_u32 v2, v4 ; encoding: [0x00,0x00,0x08,0xd8,0x02,0x04,0x00,0x00] + +ds_inc_u32 v2, v4 +// CHECK: ds_inc_u32 v2, v4 ; encoding: [0x00,0x00,0x0c,0xd8,0x02,0x04,0x00,0x00] + +ds_dec_u32 v2, v4 +// CHECK: ds_dec_u32 v2, v4 ; encoding: [0x00,0x00,0x10,0xd8,0x02,0x04,0x00,0x00] + +ds_min_i32 v2, v4 +// CHECK: ds_min_i32 v2, v4 ; encoding: [0x00,0x00,0x14,0xd8,0x02,0x04,0x00,0x00] + +ds_max_i32 v2, v4 +// CHECK: ds_max_i32 v2, v4 ; encoding: [0x00,0x00,0x18,0xd8,0x02,0x04,0x00,0x00] + +ds_min_u32 v2, v4 +// 
CHECK: ds_min_u32 v2, v4 ; encoding: [0x00,0x00,0x1c,0xd8,0x02,0x04,0x00,0x00] + +ds_max_u32 v2, v4 +// CHECK: ds_max_u32 v2, v4 ; encoding: [0x00,0x00,0x20,0xd8,0x02,0x04,0x00,0x00] + +ds_and_b32 v2, v4 +// CHECK: ds_and_b32 v2, v4 ; encoding: [0x00,0x00,0x24,0xd8,0x02,0x04,0x00,0x00] + +ds_or_b32 v2, v4 +// CHECK: ds_or_b32 v2, v4 ; encoding: [0x00,0x00,0x28,0xd8,0x02,0x04,0x00,0x00] + +ds_xor_b32 v2, v4 +// CHECK: ds_xor_b32 v2, v4 ; encoding: [0x00,0x00,0x2c,0xd8,0x02,0x04,0x00,0x00] + +ds_mskor_b32 v2, v4, v6 +// CHECK: ds_mskor_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x04,0x06,0x00] + +ds_write_b32 v2, v4 +// CHECK: ds_write_b32 v2, v4 ; encoding: [0x00,0x00,0x34,0xd8,0x02,0x04,0x00,0x00] + +ds_write2_b32 v2, v4, v6 +// CHECK: ds_write2_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] + +ds_write2st64_b32 v2, v4, v6 +// CHECK: ds_write2st64_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x3c,0xd8,0x02,0x04,0x06,0x00] + +ds_cmpst_b32 v2, v4, v6 +// CHECK: ds_cmpst_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x40,0xd8,0x02,0x04,0x06,0x00] + +ds_cmpst_f32 v2, v4, v6 +// CHECK: ds_cmpst_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x44,0xd8,0x02,0x04,0x06,0x00] + +ds_min_f32 v2, v4, v6 +// CHECK: ds_min_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x48,0xd8,0x02,0x04,0x06,0x00] + +ds_max_f32 v2, v4, v6 +// CHECK: ds_max_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x4c,0xd8,0x02,0x04,0x06,0x00] + +ds_gws_init v2 gds +// CHECK: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_sema_v v2 gds +// CHECK: ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x6a,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_sema_br v2 gds +// CHECK: ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x6e,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_sema_p v2 gds +// CHECK: ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x72,0xd8,0x02,0x00,0x00,0x00] + +ds_gws_barrier v2 gds +// CHECK: ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x76,0xd8,0x02,0x00,0x00,0x00] + +ds_write_b8 v2, v4 +// CHECK: ds_write_b8 v2, v4 ; encoding: [0x00,0x00,0x78,0xd8,0x02,0x04,0x00,0x00] + +ds_write_b16 v2, v4 +// CHECK: ds_write_b16 v2, v4 ; encoding: [0x00,0x00,0x7c,0xd8,0x02,0x04,0x00,0x00] + +ds_add_rtn_u32 v8, v2, v4 +// CHECK: ds_add_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x80,0xd8,0x02,0x04,0x00,0x08] + +ds_sub_rtn_u32 v8, v2, v4 +// CHECK: ds_sub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x84,0xd8,0x02,0x04,0x00,0x08] + +ds_rsub_rtn_u32 v8, v2, v4 +// CHECK: ds_rsub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x88,0xd8,0x02,0x04,0x00,0x08] + +ds_inc_rtn_u32 v8, v2, v4 +// CHECK: ds_inc_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x8c,0xd8,0x02,0x04,0x00,0x08] + +ds_dec_rtn_u32 v8, v2, v4 +// CHECK: ds_dec_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x90,0xd8,0x02,0x04,0x00,0x08] + +ds_min_rtn_i32 v8, v2, v4 +// CHECK: ds_min_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x94,0xd8,0x02,0x04,0x00,0x08] + +ds_max_rtn_i32 v8, v2, v4 +// CHECK: ds_max_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x98,0xd8,0x02,0x04,0x00,0x08] + +ds_min_rtn_u32 v8, v2, v4 +// CHECK: ds_min_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x9c,0xd8,0x02,0x04,0x00,0x08] + +ds_max_rtn_u32 v8, v2, v4 +// CHECK: ds_max_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0xa0,0xd8,0x02,0x04,0x00,0x08] + +ds_and_rtn_b32 v8, v2, v4 +// CHECK: ds_and_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa4,0xd8,0x02,0x04,0x00,0x08] + +ds_or_rtn_b32 v8, v2, v4 +// CHECK: ds_or_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa8,0xd8,0x02,0x04,0x00,0x08] + +ds_xor_rtn_b32 v8, v2, v4 +// CHECK: 
ds_xor_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xac,0xd8,0x02,0x04,0x00,0x08] + +ds_mskor_rtn_b32 v8, v2, v4, v6 +// CHECK: ds_mskor_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xb0,0xd8,0x02,0x04,0x06,0x08] + +ds_wrxchg_rtn_b32 v8, v2, v4 +// CHECK: ds_wrxchg_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xb4,0xd8,0x02,0x04,0x00,0x08] + +ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 +// CHECK: ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xb8,0xd8,0x02,0x04,0x06,0x08] + +ds_wrxchg2st64_rtn_b32 v[8:9] v2, v4, v6 +// CHECK: ds_wrxchg2st64_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xbc,0xd8,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_b32 v8, v2, v4, v6 +// CHECK: ds_cmpst_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc0,0xd8,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_f32 v8, v2, v4, v6 +// CHECK: ds_cmpst_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc4,0xd8,0x02,0x04,0x06,0x08] + +ds_min_rtn_f32 v8, v2, v4, v6 +// CHECK: ds_min_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc8,0xd8,0x02,0x04,0x06,0x08] + +ds_max_rtn_f32 v8, v2, v4, v6 +// CHECK: ds_max_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x06,0x08] + +ds_swizzle_b32 v8, v2 +// CHECK: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] + +ds_read_b32 v8, v2 +// CHECK: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08] + +ds_read2_b32 v[8:9], v2 +// CHECK: ds_read2_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] + +ds_read2st64_b32 v[8:9], v2 +// CHECK: ds_read2st64_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xe0,0xd8,0x02,0x00,0x00,0x08] + +ds_read_i8 v8, v2 +// CHECK: ds_read_i8 v8, v2 ; encoding: [0x00,0x00,0xe4,0xd8,0x02,0x00,0x00,0x08] + +ds_read_u8 v8, v2 +// CHECK: ds_read_u8 v8, v2 ; encoding: [0x00,0x00,0xe8,0xd8,0x02,0x00,0x00,0x08] + +ds_read_i16 v8, v2 +// CHECK: ds_read_i16 v8, v2 ; encoding: [0x00,0x00,0xec,0xd8,0x02,0x00,0x00,0x08] + +ds_read_u16 v8, v2 +// CHECK: ds_read_u16 v8, v2 ; encoding: [0x00,0x00,0xf0,0xd8,0x02,0x00,0x00,0x08] + +ds_consume v8 +// CHECK: ds_consume v8 ; encoding: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x08] + +ds_append v8 +// CHECK: ds_append v8 ; encoding: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x08] + +ds_ordered_count v8, v2 gds +// CHECK: ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0xfe,0xd8,0x02,0x00,0x00,0x08] + +ds_add_u64 v2, v[4:5] +// CHECK: ds_add_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x00,0xd9,0x02,0x04,0x00,0x00] + +ds_sub_u64 v2, v[4:5] +// CHECK: ds_sub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x04,0xd9,0x02,0x04,0x00,0x00] + +ds_rsub_u64 v2, v[4:5] +// CHECK: ds_rsub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x08,0xd9,0x02,0x04,0x00,0x00] + +ds_inc_u64 v2, v[4:5] +// CHECK: ds_inc_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x0c,0xd9,0x02,0x04,0x00,0x00] + +ds_dec_u64 v2, v[4:5] +// CHECK: ds_dec_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x10,0xd9,0x02,0x04,0x00,0x00] + +ds_min_i64 v2, v[4:5] +// CHECK: ds_min_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x14,0xd9,0x02,0x04,0x00,0x00] + +ds_max_i64 v2, v[4:5] +// CHECK: ds_max_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x18,0xd9,0x02,0x04,0x00,0x00] + +ds_min_u64 v2, v[4:5] +// CHECK: ds_min_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x1c,0xd9,0x02,0x04,0x00,0x00] + +ds_max_u64 v2, v[4:5] +// CHECK: ds_max_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x20,0xd9,0x02,0x04,0x00,0x00] + +ds_and_b64 v2, v[4:5] +// CHECK: ds_and_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x24,0xd9,0x02,0x04,0x00,0x00] + +ds_or_b64 v2, v[4:5] +// CHECK: ds_or_b64 v2, v[4:5] ; encoding: 
[0x00,0x00,0x28,0xd9,0x02,0x04,0x00,0x00] + +ds_xor_b64 v2, v[4:5] +// CHECK: ds_xor_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x2c,0xd9,0x02,0x04,0x00,0x00] + +ds_mskor_b64 v2, v[4:5], v[6:7] +// CHECK: ds_mskor_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x30,0xd9,0x02,0x04,0x06,0x00] + +ds_write_b64 v2, v[4:5] +// CHECK: ds_write_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x34,0xd9,0x02,0x04,0x00,0x00] + +ds_write2_b64 v2, v[4:5], v[6:7] +// CHECK: ds_write2_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x38,0xd9,0x02,0x04,0x06,0x00] + +ds_write2st64_b64 v2, v[4:5], v[6:7] +// CHECK: ds_write2st64_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x3c,0xd9,0x02,0x04,0x06,0x00] + +ds_cmpst_b64 v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x40,0xd9,0x02,0x04,0x06,0x00] + +ds_cmpst_f64 v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_f64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x44,0xd9,0x02,0x04,0x06,0x00] + +ds_min_f64 v2, v[4:5] +// CHECK: ds_min_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x48,0xd9,0x02,0x04,0x00,0x00] + +ds_max_f64 v2, v[4:5] +// CHECK: ds_max_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x4c,0xd9,0x02,0x04,0x00,0x00] + +ds_add_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_add_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x80,0xd9,0x02,0x04,0x00,0x08] + +ds_sub_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_sub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x84,0xd9,0x02,0x04,0x00,0x08] + +ds_rsub_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_rsub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x88,0xd9,0x02,0x04,0x00,0x08] + +ds_inc_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_inc_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x8c,0xd9,0x02,0x04,0x00,0x08] + +ds_dec_rtn_u64 v[8:9] v2, v[4:5] +// CHECK: ds_dec_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x90,0xd9,0x02,0x04,0x00,0x08] + +ds_min_rtn_i64 v[8:9], v2, v[4:5] +// CHECK: ds_min_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x94,0xd9,0x02,0x04,0x00,0x08] + +ds_max_rtn_i64 v[8:9], v2, v[4:5] +// CHECK: ds_max_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x98,0xd9,0x02,0x04,0x00,0x08] + +ds_min_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_min_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x9c,0xd9,0x02,0x04,0x00,0x08] + +ds_max_rtn_u64 v[8:9], v2, v[4:5] +// CHECK: ds_max_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa0,0xd9,0x02,0x04,0x00,0x08] + +ds_and_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_and_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa4,0xd9,0x02,0x04,0x00,0x08] + +ds_or_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_or_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa8,0xd9,0x02,0x04,0x00,0x08] + +ds_xor_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_xor_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xac,0xd9,0x02,0x04,0x00,0x08] + +ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] +// CHECK: ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb0,0xd9,0x02,0x04,0x06,0x08] + +ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] +// CHECK: ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xb4,0xd9,0x02,0x04,0x00,0x08] + +ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] +// CHECK: ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb8,0xd9,0x02,0x04,0x06,0x08] + +ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] +// CHECK: ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xbc,0xd9,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: 
[0x00,0x00,0xc0,0xd9,0x02,0x04,0x06,0x08] + +ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] +// CHECK: ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc4,0xd9,0x02,0x04,0x06,0x08] + +ds_min_rtn_f64 v[8:9], v2, v[4:5] +// CHECK: ds_min_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xc8,0xd9,0x02,0x04,0x00,0x08] + +ds_max_rtn_f64 v[8:9], v2, v[4:5] +// CHECK: ds_max_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xcc,0xd9,0x02,0x04,0x00,0x08] + +ds_read_b64 v[8:9], v2 +// CHECK: ds_read_b64 v[8:9], v2 ; encoding: [0x00,0x00,0xd8,0xd9,0x02,0x00,0x00,0x08] + +ds_read2_b64 v[8:11], v2 +// CHECK: ds_read2_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xdc,0xd9,0x02,0x00,0x00,0x08] + +ds_read2st64_b64 v[8:11], v2 +// CHECK: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xe0,0xd9,0x02,0x00,0x00,0x08] diff --git a/test/MC/AMDGPU/flat.s b/test/MC/AMDGPU/flat.s new file mode 100644 index 00000000000..adad29a5595 --- /dev/null +++ b/test/MC/AMDGPU/flat.s @@ -0,0 +1,477 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=CIVI --check-prefix=CI +// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=CIVI + +// FIXME: These instructions give an 'invalid operand' error on SI and should +// instead be reporting an 'instruction not supported' error. + +// XUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=NOVI +// XUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI +// XUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI + +//===----------------------------------------------------------------------===// +// Operands +//===----------------------------------------------------------------------===// + +flat_load_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] glc slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] glc tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] + 
+flat_load_dword v1, v[3:4] slc glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] slc tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_load_dword v1, v[3:4] tfe slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] + +flat_store_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] glc slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] glc tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] slc glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] slc tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe +// NOSI: error: 
instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +flat_store_dword v1, v[3:4] tfe slc glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] + +// FIXME: For atomic instructions, glc must be placed immediately following +// the data register. These forms aren't currently supported: +// flat_atomic_add v1, v[3:4], v5 slc glc +// flat_atomic_add v1, v[3:4], v5 slc glc tfe +// flat_atomic_add v1, v[3:4], v5 slc tfe glc +// flat_atomic_add v1, v[3:4], v5 tfe glc +// flat_atomic_add v[3:4], v5 tfe glc +// flat_atomic_add v1, v[3:4], v5 tfe glc slc +// flat_atomic_add v1, v[3:4], v5 tfe slc glc + +flat_atomic_add v1 v[3:4], v5 glc slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_add v1 v[3:4], v5 glc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x80,0x01] + +flat_atomic_add v1 v[3:4], v5 glc slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] + +flat_atomic_add v1 v[3:4], v5 glc tfe slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] + +flat_atomic_add v[3:4], v5 slc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_add v[3:4], v5 slc tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x80,0x00] + +flat_atomic_add v[3:4], v5 tfe +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x80,0x00] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +flat_load_ubyte v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_ubyte v1, v[3:4] ; encoding: [0x00,0x00,0x20,0xdc,0x03,0x00,0x00,0x01] + +flat_load_sbyte v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_sbyte v1, v[3:4] ; encoding: [0x00,0x00,0x24,0xdc,0x03,0x00,0x00,0x01] + +flat_load_ushort v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_ushort v1, v[3:4] ; encoding: [0x00,0x00,0x28,0xdc,0x03,0x00,0x00,0x01] + +flat_load_sshort v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// 
CIVI: flat_load_sshort v1, v[3:4] ; encoding: [0x00,0x00,0x2c,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dwordx2 v[1:2], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x34,0xdc,0x03,0x00,0x00,0x01] + +flat_load_dwordx4 v[5:8], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x38,0xdc,0x03,0x00,0x00,0x05] + +flat_load_dwordx3 v[5:7], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_load_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x3c,0xdc,0x03,0x00,0x00,0x05] + +flat_store_byte v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_byte v1, v[3:4] ; encoding: [0x00,0x00,0x60,0xdc,0x03,0x01,0x00,0x00] + +flat_store_short v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_short v1, v[3:4] ; encoding: [0x00,0x00,0x68,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dword v1, v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dwordx2 v[1:2], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x74,0xdc,0x03,0x01,0x00,0x00] + +flat_store_dwordx4 v[5:8], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x78,0xdc,0x03,0x05,0x00,0x00] + +flat_store_dwordx3 v[5:7], v[3:4] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_store_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x7c,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_swap v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap v[3:4], v5 ; encoding: [0x00,0x00,0xc0,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_swap v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc1,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_cmpswap v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap v[3:4], v[5:6] ; encoding: [0x00,0x00,0xc4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_cmpswap v1, v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap v1, v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0xc5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_add v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v[3:4], v5 ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_add v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_sub v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub v[3:4], v5 ; encoding: [0x00,0x00,0xcc,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_sub v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xcd,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_smin v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin 
v[3:4], v5 ; encoding: [0x00,0x00,0xd4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_smin v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_umin v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin v[3:4], v5 ; encoding: [0x00,0x00,0xd8,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_umin v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd9,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_smax v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax v[3:4], v5 ; encoding: [0x00,0x00,0xdc,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_smax v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xdd,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_umax v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax v[3:4], v5 ; encoding: [0x00,0x00,0xe0,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_umax v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe1,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_and v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and v[3:4], v5 ; encoding: [0x00,0x00,0xe4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_and v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_or v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or v[3:4], v5 ; encoding: [0x00,0x00,0xe8,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_or v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe9,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_xor v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor v[3:4], v5 ; encoding: [0x00,0x00,0xec,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_xor v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xed,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_inc v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc v[3:4], v5 ; encoding: [0x00,0x00,0xf0,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_inc v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf1,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_dec v[3:4], v5 +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec v[3:4], v5 ; encoding: [0x00,0x00,0xf4,0xdc,0x03,0x05,0x00,0x00] + +flat_atomic_dec v1, v[3:4], v5 glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf5,0xdc,0x03,0x05,0x00,0x01] + +flat_atomic_swap_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x40,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_swap_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_swap_x2 v[1:2], v[3:4], 
v[5:6] glc ; encoding: [0x00,0x00,0x41,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_cmpswap_x2 v[3:4], v[5:8] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x44,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x45,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_add_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x48,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x49,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_sub_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x4c,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x4d,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_smin_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x54,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x55,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_umin_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x58,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x59,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_smax_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x5c,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x5d,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_umax_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x60,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x61,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_and_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x64,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x65,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_or_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x68,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_or_x2 v[1:2], v[3:4], 
v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_or_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x69,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_xor_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x6c,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x6d,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_inc_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x70,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x71,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_dec_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x74,0xdd,0x03,0x05,0x00,0x00] + +flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CIVI: flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x75,0xdd,0x03,0x05,0x00,0x01] + +flat_atomic_fcmpswap_x2 v[3:4], v[5:8] +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fcmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x78,0xdd,0x03,0x05,0x00,0x00] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x79,0xdd,0x03,0x05,0x00,0x01] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmin_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x7c,0xdd,0x03,0x05,0x00,0x00] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x7d,0xdd,0x03,0x05,0x00,0x01] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmax_x2 v[3:4], v[5:6] +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x80,0xdd,0x03,0x05,0x00,0x00] +// NOVI: error: instruction not supported on this GPU + +flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc +// NOSI: error: instruction not supported on this GPU +// CI: flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x81,0xdd,0x03,0x05,0x00,0x01] +// NOVI: error: instruction not supported on this GPU diff --git a/test/MC/AMDGPU/lit.local.cfg b/test/MC/AMDGPU/lit.local.cfg new file mode 100644 index 00000000000..2a665f06be7 --- /dev/null +++ b/test/MC/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True diff --git a/test/MC/AMDGPU/mubuf.s b/test/MC/AMDGPU/mubuf.s new file mode 100644 index 00000000000..78d365abef1 --- /dev/null +++ b/test/MC/AMDGPU/mubuf.s @@ -0,0 +1,352 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + 
+//===----------------------------------------------------------------------===// +// Test for different operand combinations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// load - immediate offset only +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, s[4:7], s1 +// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 glc +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 slc +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x41,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 tfe +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x81,0x01] + +buffer_load_dword v1, s[4:7], s1 tfe glc +// CHECK: buffer_load_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x30,0xe0,0x00,0x01,0x81,0x01] + +buffer_load_dword v1, s[4:7], s1 offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] + +buffer_load_dword v1, s[4:7], s1 glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - vgpr offset +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v2, s[4:7], s1 offen +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen tfe glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - vgpr index +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v2, s[4:7], s1 idxen +// CHECK: buffer_load_dword 
v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 tfe +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 tfe ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen tfe glc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - vgpr index and offset +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// load - addr64 +//===----------------------------------------------------------------------===// + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 +// 
CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0x01,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x41,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 tfe glc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x30,0xe0,0x02,0x01,0x81,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] + +buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 +// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - immediate offset only +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, s[4:7], s1 +// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 glc +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 slc +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x41,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 tfe +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x81,0x01] + +buffer_store_dword v1, s[4:7], s1 tfe glc +// CHECK: buffer_store_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x70,0xe0,0x00,0x01,0x81,0x01] + +buffer_store_dword v1, s[4:7], s1 offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] + +buffer_store_dword v1, s[4:7], s1 glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - vgpr offset +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v2, s[4:7], s1 offen +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc +// CHECK: 
buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen tfe glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - vgpr index +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v2, s[4:7], s1 idxen +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen tfe glc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - vgpr index and offset +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe +// CHECK: 
buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// store - addr64 +//===----------------------------------------------------------------------===// + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0x01,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x41,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 tfe glc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x70,0xe0,0x02,0x01,0x81,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] + +buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 +// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +buffer_load_format_x v1, s[4:7], s1 +// CHECK: buffer_load_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_xy v[1:2], s[4:7], s1 +// CHECK: buffer_load_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_xyz v[1:3], s[4:7], s1 +// CHECK: buffer_load_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_format_xyzw v[1:4], s[4:7], s1 +// CHECK: buffer_load_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_x v1, s[4:7], s1 +// CHECK: buffer_store_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_xy v[1:2], s[4:7], s1 +// CHECK: buffer_store_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x01,0x01,0x01] + 
+buffer_store_format_xyz v[1:3], s[4:7], s1 +// CHECK: buffer_store_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_format_xyzw v[1:4], s[4:7], s1 +// CHECK: buffer_store_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_ubyte v1, s[4:7], s1 +// CHECK: buffer_load_ubyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_sbyte v1, s[4:7], s1 +// CHECK: buffer_load_sbyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_ushort v1, s[4:7], s1 +// CHECK: buffer_load_ushort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_sshort v1, s[4:7], s1 +// CHECK: buffer_load_sshort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dword v1, s[4:7], s1 +// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dwordx2 v[1:2], s[4:7], s1 +// CHECK: buffer_load_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x01,0x01] + +buffer_load_dwordx4 v[1:4], s[4:7], s1 +// CHECK: buffer_load_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_byte v1, s[4:7], s1 +// CHECK: buffer_store_byte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_short v1, s[4:7], s1 +// CHECK: buffer_store_short v1, s[4:7], s1 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dword v1 s[4:7], s1 +// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dwordx2 v[1:2], s[4:7], s1 +// CHECK: buffer_store_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x01,0x01] + +buffer_store_dwordx4 v[1:4], s[4:7], s1 +// CHECK: buffer_store_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x01,0x01,0x01] + +// TODO: Atomics diff --git a/test/MC/AMDGPU/smrd.s b/test/MC/AMDGPU/smrd.s new file mode 100644 index 00000000000..b67abf7e689 --- /dev/null +++ b/test/MC/AMDGPU/smrd.s @@ -0,0 +1,32 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +s_load_dword s1, s[2:3], 1 +// CHECK: s_load_dword s1, s[2:3], 0x1 ; encoding: [0x01,0x83,0x00,0xc0] + +s_load_dword s1, s[2:3], s4 +// CHECK: s_load_dword s1, s[2:3], s4 ; encoding: [0x04,0x82,0x00,0xc0] + +s_load_dwordx2 s[2:3], s[2:3], 1 +// CHECK: s_load_dwordx2 s[2:3], s[2:3], 0x1 ; encoding: [0x01,0x03,0x41,0xc0] + +s_load_dwordx2 s[2:3], s[2:3], s4 +// CHECK: s_load_dwordx2 s[2:3], s[2:3], s4 ; encoding: [0x04,0x02,0x41,0xc0] + +s_load_dwordx4 s[4:7], s[2:3], 1 +// CHECK: s_load_dwordx4 s[4:7], s[2:3], 0x1 ; encoding: [0x01,0x03,0x82,0xc0] + +s_load_dwordx4 s[4:7], s[2:3], s4 +// CHECK: s_load_dwordx4 s[4:7], s[2:3], s4 ; encoding: [0x04,0x02,0x82,0xc0] + +s_load_dwordx8 s[8:15], s[2:3], 1 +// CHECK: s_load_dwordx8 s[8:15], s[2:3], 0x1 ; encoding: [0x01,0x03,0xc4,0xc0] + +s_load_dwordx8 s[8:15], s[2:3], s4 +// CHECK: s_load_dwordx8 s[8:15], s[2:3], s4 ; encoding: [0x04,0x02,0xc4,0xc0] + +s_load_dwordx16 s[16:31], s[2:3], 1 +// CHECK: s_load_dwordx16 s[16:31], s[2:3], 0x1 ; encoding: [0x01,0x03,0x08,0xc1] + +s_load_dwordx16 s[16:31], s[2:3], s4 +// CHECK: s_load_dwordx16 s[16:31], s[2:3], s4 ; encoding: [0x04,0x02,0x08,0xc1] diff --git a/test/MC/AMDGPU/sop1-err.s b/test/MC/AMDGPU/sop1-err.s new file mode 100644 index 
00000000000..f892356b623 --- /dev/null +++ b/test/MC/AMDGPU/sop1-err.s @@ -0,0 +1,37 @@ +// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s + +s_mov_b32 v1, s2 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s1, v0 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s[1:2], s0 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s0, s[1:2] +// CHECK: error: invalid operand for instruction + +s_mov_b32 s220, s0 +// CHECK: error: invalid operand for instruction + +s_mov_b32 s0, s220 +// CHECK: error: invalid operand for instruction + +s_mov_b64 s1, s[0:1] +// CHECK: error: invalid operand for instruction + +s_mov_b64 s[0:1], s1 +// CHECK: error: invalid operand for instruction + +// Immediate greater than 32-bits +s_mov_b32 s1, 0xfffffffff +// CHECK: error: invalid immediate: only 32-bit values are legal + +// Immediate greater than 32-bits +s_mov_b64 s[0:1], 0xfffffffff +// CHECK: error: invalid immediate: only 32-bit values are legal + +// Out of range register +s_mov_b32 s diff --git a/test/MC/AMDGPU/sop1.s b/test/MC/AMDGPU/sop1.s new file mode 100644 index 00000000000..92ca73f2500 --- /dev/null +++ b/test/MC/AMDGPU/sop1.s @@ -0,0 +1,177 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +s_mov_b32 s1, s2 +// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] + +s_mov_b32 s1, 1 +// CHECK: s_mov_b32 s1, 1 ; encoding: [0x81,0x03,0x81,0xbe] + +s_mov_b32 s1, 100 +// CHECK: s_mov_b32 s1, 0x64 ; encoding: [0xff,0x03,0x81,0xbe,0x64,0x00,0x00,0x00] + +s_mov_b64 s[2:3], s[4:5] +// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] + +s_mov_b64 s[2:3], 0xffffffffffffffff +// CHECK: s_mov_b64 s[2:3], -1 ; encoding: [0xc1,0x04,0x82,0xbe] + +s_cmov_b32 s1, 200 +// CHECK: s_cmov_b32 s1, 0xc8 ; encoding: [0xff,0x05,0x81,0xbe,0xc8,0x00,0x00,0x00] + +s_cmov_b32 s1, 1.0 +// CHECK: s_cmov_b32 s1, 1.0 ; encoding: [0xf2,0x05,0x81,0xbe] + +//s_cmov_b64 s[2:3], 1.0 +//CHECK-FIXME: s_cmov_b64 s[2:3], 1.0 ; encoding: [0xf2,0x05,0x82,0xb3] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +s_mov_b32 s1, s2 +// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] + +s_mov_b64 s[2:3], s[4:5] +// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] + +s_cmov_b32 s1, s2 +// CHECK: s_cmov_b32 s1, s2 ; encoding: [0x02,0x05,0x81,0xbe] + +s_cmov_b64 s[2:3], s[4:5] +// CHECK: s_cmov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x06,0x82,0xbe] + +s_not_b32 s1, s2 +// CHECK: s_not_b32 s1, s2 ; encoding: [0x02,0x07,0x81,0xbe] + +s_not_b64 s[2:3], s[4:5] +// CHECK: s_not_b64 s[2:3], s[4:5] ; encoding: [0x04,0x08,0x82,0xbe] + +s_wqm_b32 s1, s2 +// CHECK: s_wqm_b32 s1, s2 ; encoding: [0x02,0x09,0x81,0xbe] + +s_wqm_b64 s[2:3], s[4:5] +// CHECK: s_wqm_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0a,0x82,0xbe] + +s_brev_b32 s1, s2 +// CHECK: s_brev_b32 s1, s2 ; encoding: [0x02,0x0b,0x81,0xbe] + +s_brev_b64 s[2:3], s[4:5] +// CHECK: s_brev_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0c,0x82,0xbe] + +s_bcnt0_i32_b32 s1, s2 +// CHECK: s_bcnt0_i32_b32 s1, s2 ; encoding: [0x02,0x0d,0x81,0xbe] + +s_bcnt0_i32_b64 s1, s[2:3] +// CHECK: s_bcnt0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x0e,0x81,0xbe] + +s_bcnt1_i32_b32 s1, s2 +// CHECK: s_bcnt1_i32_b32 s1, s2 ; encoding: [0x02,0x0f,0x81,0xbe] + 
+s_bcnt1_i32_b64 s1, s[2:3] +// CHECK: s_bcnt1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x10,0x81,0xbe] + +s_ff0_i32_b32 s1, s2 +// CHECK: s_ff0_i32_b32 s1, s2 ; encoding: [0x02,0x11,0x81,0xbe] + +s_ff0_i32_b64 s1, s[2:3] +// CHECK: s_ff0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x12,0x81,0xbe] + +s_ff1_i32_b32 s1, s2 +// CHECK: s_ff1_i32_b32 s1, s2 ; encoding: [0x02,0x13,0x81,0xbe] + +s_ff1_i32_b64 s1, s[2:3] +// CHECK: s_ff1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x14,0x81,0xbe] + +s_flbit_i32_b32 s1, s2 +// CHECK: s_flbit_i32_b32 s1, s2 ; encoding: [0x02,0x15,0x81,0xbe] + +s_flbit_i32_b64 s1, s[2:3] +// CHECK: s_flbit_i32_b64 s1, s[2:3] ; encoding: [0x02,0x16,0x81,0xbe] + +s_flbit_i32 s1, s2 +// CHECK: s_flbit_i32 s1, s2 ; encoding: [0x02,0x17,0x81,0xbe] + +s_flbit_i32_i64 s1, s[2:3] +// CHECK: s_flbit_i32_i64 s1, s[2:3] ; encoding: [0x02,0x18,0x81,0xbe] + +s_sext_i32_i8 s1, s2 +// CHECK: s_sext_i32_i8 s1, s2 ; encoding: [0x02,0x19,0x81,0xbe] + +s_sext_i32_i16 s1, s2 +// CHECK: s_sext_i32_i16 s1, s2 ; encoding: [0x02,0x1a,0x81,0xbe] + +s_bitset0_b32 s1, s2 +// CHECK: s_bitset0_b32 s1, s2 ; encoding: [0x02,0x1b,0x81,0xbe] + +s_bitset0_b64 s[2:3], s[4:5] +// CHECK: s_bitset0_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1c,0x82,0xbe] + +s_bitset1_b32 s1, s2 +// CHECK: s_bitset1_b32 s1, s2 ; encoding: [0x02,0x1d,0x81,0xbe] + +s_bitset1_b64 s[2:3], s[4:5] +// CHECK: s_bitset1_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1e,0x82,0xbe] + +s_getpc_b64 s[2:3] +// CHECK: s_getpc_b64 s[2:3] ; encoding: [0x00,0x1f,0x82,0xbe] + +s_setpc_b64 s[2:3], s[4:5] +// CHECK: s_setpc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x20,0x82,0xbe] + +s_swappc_b64 s[2:3], s[4:5] +// CHECK: s_swappc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x21,0x82,0xbe] + +s_rfe_b64 s[2:3], s[4:5] +// CHECK: s_rfe_b64 s[2:3], s[4:5] ; encoding: [0x04,0x22,0x82,0xbe] + +s_and_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_and_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x24,0x82,0xbe] + +s_or_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_or_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x25,0x82,0xbe] + +s_xor_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_xor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x26,0x82,0xbe] + +s_andn2_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_andn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x27,0x82,0xbe] + +s_orn2_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_orn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x28,0x82,0xbe] + +s_nand_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_nand_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x29,0x82,0xbe] + +s_nor_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_nor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2a,0x82,0xbe] + +s_xnor_saveexec_b64 s[2:3], s[4:5] +// CHECK: s_xnor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2b,0x82,0xbe] + +s_quadmask_b32 s1, s2 +// CHECK: s_quadmask_b32 s1, s2 ; encoding: [0x02,0x2c,0x81,0xbe] + +s_quadmask_b64 s[2:3], s[4:5] +// CHECK: s_quadmask_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2d,0x82,0xbe] + +s_movrels_b32 s1, s2 +// CHECK: s_movrels_b32 s1, s2 ; encoding: [0x02,0x2e,0x81,0xbe] + +s_movrels_b64 s[2:3], s[4:5] +// CHECK: s_movrels_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2f,0x82,0xbe] + +s_movreld_b32 s1, s2 +// CHECK: s_movreld_b32 s1, s2 ; encoding: [0x02,0x30,0x81,0xbe] + +s_movreld_b64 s[2:3], s[4:5] +// CHECK: s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x31,0x82,0xbe] + +s_cbranch_join s[4:5] +// CHECK: s_cbranch_join s[4:5] ; encoding: [0x04,0x32,0x80,0xbe] + +s_abs_i32 s1, s2 +// CHECK: s_abs_i32 s1, s2 ; encoding: [0x02,0x34,0x81,0xbe] + +s_mov_fed_b32 s1, s2 +// 
CHECK: s_mov_fed_b32 s1, s2 ; encoding: [0x02,0x35,0x81,0xbe] diff --git a/test/MC/AMDGPU/sop2.s b/test/MC/AMDGPU/sop2.s new file mode 100644 index 00000000000..9a7a1c01064 --- /dev/null +++ b/test/MC/AMDGPU/sop2.s @@ -0,0 +1,131 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +// CHECK: s_add_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x80] +s_add_u32 s1, s2, s3 + +// CHECK: s_sub_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x80] +s_sub_u32 s1, s2, s3 + +// CHECK: s_add_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x81] +s_add_i32 s1, s2, s3 + +// CHECK: s_sub_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x81] +s_sub_i32 s1, s2, s3 + +// CHECK: s_addc_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x82] +s_addc_u32 s1, s2, s3 + +// CHECK: s_subb_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x82] +s_subb_u32 s1, s2, s3 + +// CHECK: s_min_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x83] +s_min_i32 s1, s2, s3 + +// CHECK: s_min_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x83] +s_min_u32 s1, s2, s3 + +// CHECK: s_max_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x84] +s_max_i32 s1, s2, s3 + +// CHECK: s_max_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x84] +s_max_u32 s1, s2, s3 + +// CHECK: s_cselect_b32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x85] +s_cselect_b32 s1, s2, s3 + +// CHECK: s_cselect_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x85] +s_cselect_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_and_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x87] +s_and_b32 s2, s4, s6 + +// CHECK: s_and_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x87] +s_and_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_or_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x88] +s_or_b32 s2, s4, s6 + +// CHECK: s_or_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x88] +s_or_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_xor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x89] +s_xor_b32 s2, s4, s6 + +// CHECK: s_xor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x89] +s_xor_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_andn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8a] +s_andn2_b32 s2, s4, s6 + +// CHECK: s_andn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8a] +s_andn2_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_orn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8b] +s_orn2_b32 s2, s4, s6 + +// CHECK: s_orn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8b] +s_orn2_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_nand_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8c] +s_nand_b32 s2, s4, s6 + +// CHECK: s_nand_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8c] +s_nand_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_nor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8d] +s_nor_b32 s2, s4, s6 + +// CHECK: s_nor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8d] +s_nor_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_xnor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8e] +s_xnor_b32 s2, s4, s6 + +// CHECK: s_xnor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8e] +s_xnor_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_lshl_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8f] +s_lshl_b32 s2, s4, s6 + +// CHECK: s_lshl_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x8f] +s_lshl_b64 s[2:3], s[4:5], s6 + +// CHECK: s_lshr_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x90] +s_lshr_b32 s2, s4, s6 + +// CHECK: s_lshr_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x90] +s_lshr_b64 s[2:3], s[4:5], s6 + +// CHECK: 
s_ashr_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x91] +s_ashr_i32 s2, s4, s6 + +// CHECK: s_ashr_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x91] +s_ashr_i64 s[2:3], s[4:5], s6 + +// CHECK: s_bfm_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x92] +s_bfm_b32 s2, s4, s6 + +// CHECK: s_bfm_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x92] +s_bfm_b64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_mul_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x93] +s_mul_i32 s2, s4, s6 + +// CHECK: s_bfe_u32 s2, s4, s6 ; encoding: [0x04,0x06,0x82,0x93] +s_bfe_u32 s2, s4, s6 + +// CHECK: s_bfe_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x94] +s_bfe_i32 s2, s4, s6 + +// CHECK: s_bfe_u64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x94] +s_bfe_u64 s[2:3], s[4:5], s[6:7] + +// CHECK: s_bfe_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x02,0x95] +s_bfe_i64 s[2:3], s[4:5], s6 + +// CHECK: s_cbranch_g_fork s[4:5], s[6:7] ; encoding: [0x04,0x06,0x80,0x95] +s_cbranch_g_fork s[4:5], s[6:7] + +// CHECK: s_absdiff_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x96] +s_absdiff_i32 s2, s4, s6 diff --git a/test/MC/AMDGPU/sopc.s b/test/MC/AMDGPU/sopc.s new file mode 100644 index 00000000000..0899c1a2eed --- /dev/null +++ b/test/MC/AMDGPU/sopc.s @@ -0,0 +1,9 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +s_cmp_eq_i32 s1, s2 +// CHECK: s_cmp_eq_i32 s1, s2 ; encoding: [0x01,0x02,0x00,0xbf] diff --git a/test/MC/AMDGPU/sopk.s b/test/MC/AMDGPU/sopk.s new file mode 100644 index 00000000000..6c27aaccb80 --- /dev/null +++ b/test/MC/AMDGPU/sopk.s @@ -0,0 +1,66 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +s_movk_i32 s2, 0x6 +// CHECK: s_movk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb0] + +s_cmovk_i32 s2, 0x6 +// CHECK: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] + +s_cmpk_eq_i32 s2, 0x6 +// CHECK: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] + +s_cmpk_lg_i32 s2, 0x6 +// CHECK: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] + +s_cmpk_gt_i32 s2, 0x6 +// CHECK: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] + +s_cmpk_ge_i32 s2, 0x6 +// CHECK: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] + +s_cmpk_lt_i32 s2, 0x6 +// CHECK: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] + +s_cmpk_le_i32 s2, 0x6 +// CHECK: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] + +s_cmpk_eq_u32 s2, 0x6 +// CHECK: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] + +s_cmpk_lg_u32 s2, 0x6 +// CHECK: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] + +s_cmpk_gt_u32 s2, 0x6 +// CHECK: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] + +s_cmpk_ge_u32 s2, 0x6 +// CHECK: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] + +s_cmpk_lt_u32 s2, 0x6 +// CHECK: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] + +s_cmpk_le_u32 s2, 0x6 +// CHECK: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] + +s_addk_i32 s2, 0x6 +// CHECK: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] + +s_mulk_i32 s2, 0x6 +// CHECK: s_mulk_i32 
s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb8] + +s_cbranch_i_fork s[2:3], 0x6 +// CHECK: s_cbranch_i_fork s[2:3], 0x6 ; encoding: [0x06,0x00,0x82,0xb8] + +s_getreg_b32 s2, 0x6 +// CHECK: s_getreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb9] + +s_setreg_b32 s2, 0x6 +// CHECK: s_setreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb9] + +s_setreg_imm32_b32 0xff, 0x6 +// CHECK: s_setreg_imm32_b32 0xff, 0x6 ; encoding: [0x06,0x00,0x80,0xba,0xff,0x00,0x00,0x00] diff --git a/test/MC/AMDGPU/sopp.s b/test/MC/AMDGPU/sopp.s new file mode 100644 index 00000000000..b072c16fdb2 --- /dev/null +++ b/test/MC/AMDGPU/sopp.s @@ -0,0 +1,64 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Edge Cases +//===----------------------------------------------------------------------===// + +s_nop 0 // CHECK: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] +s_nop 0xffff // CHECK: s_nop 0xffff ; encoding: [0xff,0xff,0x80,0xbf] + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + + s_nop 1 // CHECK: s_nop 1 ; encoding: [0x01,0x00,0x80,0xbf] + s_endpgm // CHECK: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] + s_branch 2 // CHECK: s_branch 2 ; encoding: [0x02,0x00,0x82,0xbf] + s_cbranch_scc0 3 // CHECK: s_cbranch_scc0 3 ; encoding: [0x03,0x00,0x84,0xbf] + s_cbranch_scc1 4 // CHECK: s_cbranch_scc1 4 ; encoding: [0x04,0x00,0x85,0xbf] + s_cbranch_vccz 5 // CHECK: s_cbranch_vccz 5 ; encoding: [0x05,0x00,0x86,0xbf] + s_cbranch_vccnz 6 // CHECK: s_cbranch_vccnz 6 ; encoding: [0x06,0x00,0x87,0xbf] + s_cbranch_execz 7 // CHECK: s_cbranch_execz 7 ; encoding: [0x07,0x00,0x88,0xbf] + s_cbranch_execnz 8 // CHECK: s_cbranch_execnz 8 ; encoding: [0x08,0x00,0x89,0xbf] + s_barrier // CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] + +//===----------------------------------------------------------------------===// +// s_waitcnt +//===----------------------------------------------------------------------===// + + s_waitcnt 0 + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] + + s_waitcnt vmcnt(1) + // CHECK: s_waitcnt vmcnt(1) ; encoding: [0x71,0x07,0x8c,0xbf] + + s_waitcnt expcnt(2) + // CHECK: s_waitcnt expcnt(2) ; encoding: [0x2f,0x07,0x8c,0xbf] + + s_waitcnt lgkmcnt(3) + // CHECK: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0x03,0x8c,0xbf] + + s_waitcnt vmcnt(0), expcnt(0) + // CHECK: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x07,0x8c,0xbf] + + + s_sethalt 9 // CHECK: s_sethalt 9 ; encoding: [0x09,0x00,0x8d,0xbf] + s_sleep 10 // CHECK: s_sleep 10 ; encoding: [0x0a,0x00,0x8e,0xbf] + s_setprio 1 // CHECK: s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf] + s_sendmsg 2 // CHECK: s_sendmsg Gs(nop), [m0] ; encoding: [0x02,0x00,0x90,0xbf] + s_sendmsghalt 3 // CHECK: s_sendmsghalt 3 ; encoding: [0x03,0x00,0x91,0xbf] + s_trap 4 // CHECK: s_trap 4 ; encoding: [0x04,0x00,0x92,0xbf] + s_icache_inv // CHECK: s_icache_inv ; encoding: 
[0x00,0x00,0x93,0xbf] + s_incperflevel 5 // CHECK: s_incperflevel 5 ; encoding: [0x05,0x00,0x94,0xbf] + s_decperflevel 6 // CHECK: s_decperflevel 6 ; encoding: [0x06,0x00,0x95,0xbf] + s_ttracedata // CHECK: s_ttracedata ; encoding: [0x00,0x00,0x96,0xbf] diff --git a/test/MC/AMDGPU/vop1.s b/test/MC/AMDGPU/vop1.s new file mode 100644 index 00000000000..d0b00fcd189 --- /dev/null +++ b/test/MC/AMDGPU/vop1.s @@ -0,0 +1,357 @@ +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI --check-prefix=CIVI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI + +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI + + +// GCN: v_nop ; encoding: [0x00,0x00,0x00,0x7e] +v_nop + +// GCN: v_mov_b32_e32 v1, v2 ; encoding: [0x02,0x03,0x02,0x7e] +v_mov_b32 v1, v2 + +// GCN: v_readfirstlane_b32 s1, v2 ; encoding: [0x02,0x05,0x02,0x7e] +v_readfirstlane_b32 s1, v2 + +// GCN: v_cvt_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x07,0x02,0x7e] +v_cvt_i32_f64 v1, v[2:3] + +// GCN: v_cvt_f64_i32_e32 v[1:2], v2 ; encoding: [0x02,0x09,0x02,0x7e] +v_cvt_f64_i32 v[1:2], v2 + +// GCN: v_cvt_f32_i32_e32 v1, v2 ; encoding: [0x02,0x0b,0x02,0x7e] +v_cvt_f32_i32 v1, v2 + +// GCN: v_cvt_f32_u32_e32 v1, v2 ; encoding: [0x02,0x0d,0x02,0x7e] +v_cvt_f32_u32 v1, v2 + +// GCN: v_cvt_u32_f32_e32 v1, v2 ; encoding: [0x02,0x0f,0x02,0x7e +v_cvt_u32_f32 v1, v2 + +// GCN: v_cvt_i32_f32_e32 v1, v2 ; encoding: [0x02,0x11,0x02,0x7e] +v_cvt_i32_f32 v1, v2 + +// SICI: v_mov_fed_b32_e32 v1, v2 ; encoding: [0x02,0x13,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +v_mov_fed_b32 v1, v2 + +// GCN: v_cvt_f16_f32_e32 v1, v2 ; encoding: [0x02,0x15,0x02,0x7e] +v_cvt_f16_f32 v1, v2 + +// GCN: v_cvt_f32_f16_e32 v1, v2 ; encoding: [0x02,0x17,0x02,0x7e] +v_cvt_f32_f16 v1, v2 + +// GCN: v_cvt_rpi_i32_f32_e32 v1, v2 ; encoding: [0x02,0x19,0x02,0x7e] +v_cvt_rpi_i32_f32 v1, v2 + +// GCN: v_cvt_flr_i32_f32_e32 v1, v2 ; encoding: [0x02,0x1b,0x02,0x7e] +v_cvt_flr_i32_f32 v1, v2 + +// GCN: v_cvt_off_f32_i4_e32 v1, v2 ; encoding: [0x02,0x1d,0x02,0x7e] +v_cvt_off_f32_i4_e32 v1, v2 + +// GCN: v_cvt_f32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x1f,0x02,0x7e] +v_cvt_f32_f64 v1, v[2:3] + +// GCN: v_cvt_f64_f32_e32 v[1:2], v2 ; encoding: [0x02,0x21,0x02,0x7e] +v_cvt_f64_f32 v[1:2], v2 + +// GCN: v_cvt_f32_ubyte0_e32 v1, v2 ; encoding: [0x02,0x23,0x02,0x7e] +v_cvt_f32_ubyte0 v1, v2 + +// GCN: v_cvt_f32_ubyte1_e32 v1, v2 ; encoding: [0x02,0x25,0x02,0x7e] +v_cvt_f32_ubyte1_e32 v1, v2 + +// GCN: v_cvt_f32_ubyte2_e32 v1, v2 ; encoding: [0x02,0x27,0x02,0x7e] +v_cvt_f32_ubyte2 v1, v2 + +// GCN: v_cvt_f32_ubyte3_e32 v1, v2 ; encoding: [0x02,0x29,0x02,0x7e] +v_cvt_f32_ubyte3 v1, v2 + +// GCN: v_cvt_u32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x2b,0x02,0x7e] +v_cvt_u32_f64 v1, v[2:3] + +// 
GCN: v_cvt_f64_u32_e32 v[1:2], v2 ; encoding: [0x02,0x2d,0x02,0x7e] +v_cvt_f64_u32 v[1:2], v2 + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_trunc_f64_e32 v[1:2], v[2:3] +// CIVI: v_trunc_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x2f,0x02,0x7e] +v_trunc_f64_e32 v[1:2], v[2:3] + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_ceil_f64_e32 v[1:2], v[2:3] +// CIVI: v_ceil_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x31,0x02,0x7e] +v_ceil_f64_e32 v[1:2], v[2:3] + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_rndne_f64_e32 v[1:2], v[2:3] +// CIVI: v_rndne_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x33,0x02,0x7e] +v_rndne_f64_e32 v[1:2], v[2:3] + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_floor_f64_e32 v[1:2], v[2:3] +// CIVI: v_floor_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x35,0x02,0x7e] +v_floor_f64_e32 v[1:2], v[2:3] + +// SICI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] +// VI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x37,0x02,0x7e] +v_fract_f32 v1, v2 + +// SICI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] +// VI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x39,0x02,0x7e] +v_trunc_f32 v1, v2 + +// SICI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] +// VI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x3b,0x02,0x7e] +v_ceil_f32 v1, v2 + +// SICI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] +// VI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x3d,0x02,0x7e] +v_rndne_f32 v1, v2 + +// SICI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] +// VI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x3f,0x02,0x7e] +v_floor_f32_e32 v1, v2 + +// SICI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x4b,0x02,0x7e] +// VI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] +v_exp_f32 v1, v2 + +// SICI: v_log_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x4d,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_log_clamp_f32 v1, v2 +v_log_clamp_f32 v1, v2 + +// SICI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] +// VI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] +v_log_f32 v1, v2 + +// SICI: v_rcp_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x51,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rcp_clamp_f32 v1, v2 +v_rcp_clamp_f32 v1, v2 + +// SICI: v_rcp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rcp_legacy_f32 v1, v2 +v_rcp_legacy_f32 v1, v2 + +// SICI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] +// VI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] +v_rcp_f32 v1, v2 + +// SICI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] +// VI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] +v_rcp_iflag_f32 v1, v2 + +// SICI: v_rsq_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rsq_clamp_f32 v1, v2 +v_rsq_clamp_f32 v1, v2 + +// SICI: v_rsq_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rsq_legacy_f32 v1, v2 +v_rsq_legacy_f32 v1, v2 + +// SICI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] +// VI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] +v_rsq_f32_e32 v1, v2 + +// SICI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x5f,0x02,0x7e] +// VI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4b,0x02,0x7e] +v_rcp_f64 v[1:2], v[2:3] + +// 
SICI: v_rcp_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rcp_clamp_f64 v[1:2], v[2:3] +v_rcp_clamp_f64 v[1:2], v[2:3] + +// SICI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] +// VI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4d,0x02,0x7e] +v_rsq_f64 v[1:2], v[2:3] + +// SICI: v_rsq_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_rsq_clamp_f64 v[1:2], v[2:3] +v_rsq_clamp_f64 v[1:2], v[2:3] + +// SICI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] +// VI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] +v_sqrt_f32 v1, v2 + +// SICI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x69,0x02,0x7e] +// VI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x51,0x02,0x7e] +v_sqrt_f64 v[1:2], v[2:3] + +// SICI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x6b,0x02,0x7e] +// VI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] +v_sin_f32 v1, v2 + +// SICI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] +// VI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] +v_cos_f32 v1, v2 + +// SICI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] +// VI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] +v_not_b32 v1, v2 + +// SICI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] +// VI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] +v_bfrev_b32 v1, v2 + +// SICI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] +// VI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] +v_ffbh_u32 v1, v2 + +// SICI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] +// VI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] +v_ffbl_b32 v1, v2 + +// SICI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] +// VI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x5f,0x02,0x7e] +v_ffbh_i32_e32 v1, v2 + +// SICI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x79,0x02,0x7e] +// VI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] +v_frexp_exp_i32_f64 v1, v[2:3] + +// SICI: v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7b,0x02,0x7e] +// VI; v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] +v_frexp_mant_f64 v[1:2], v[2:3] + +// SICI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7d,0x02,0x7e] +// VI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] +v_fract_f64 v[1:2], v[2:3] + +// SICI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] +// VI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] +v_frexp_exp_i32_f32 v1, v2 + +// SICI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] +// VI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x69,0x02,0x7e] +v_frexp_mant_f32 v1, v2 + +// SICI: v_clrexcp ; encoding: [0x00,0x82,0x00,0x7e] +// VI: v_clrexcp ; encoding: [0x00,0x6a,0x00,0x7e] +v_clrexcp + +// SICI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] +// VI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] +v_movreld_b32 v1, v2 + +// SICI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] +// VI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] +v_movrels_b32 v1, v2 + +// SICI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] +// VI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] +v_movrelsd_b32 v1, v2 + +// NOSI: error: instruction not supported on this GPU +// NOSI: 
v_log_legacy_f32 v1, v2 +// CI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] +// VI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x99,0x02,0x7e] +v_log_legacy_f32 v1, v2 + +// NOSI: error: instruction not supported on this GPU +// NOSI: v_exp_legacy_f32 v1, v2 +// CI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] +// VI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x97,0x02,0x7e] +v_exp_legacy_f32 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_f16_u16 v1, v2 +// VI: v_cvt_f16_u16_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] +v_cvt_f16_u16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_f16_i16 v1, v2 +// VI: v_cvt_f16_i16_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] +v_cvt_f16_i16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_u16_f16 v1, v2 +// VI: v_cvt_u16_f16_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] +v_cvt_u16_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cvt_i16_f16 v1, v2 +// VI: v_cvt_i16_f16_e32 v1, v2 ; encoding: [0x02,0x79,0x02,0x7e] +v_cvt_i16_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_rcp_f16 v1, v2 +// VI: v_rcp_f16_e32 v1, v2 ; encoding: [0x02,0x7b,0x02,0x7e] +v_rcp_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sqrt_f16 v1, v2 +// VI: v_sqrt_f16_e32 v1, v2 ; encoding: [0x02,0x7d,0x02,0x7e] +v_sqrt_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_rsq_f16 v1, v2 +// VI: v_rsq_f16_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] +v_rsq_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_log_f16 v1, v2 +// VI: v_log_f16_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] +v_log_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_exp_f16 v1, v2 +// VI: v_exp_f16_e32 v1, v2 ; encoding: [0x02,0x83,0x02,0x7e] +v_exp_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_frexp_mant_f16 v1, v2 +// VI: v_frexp_mant_f16_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] +v_frexp_mant_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_frexp_exp_i16_f16 v1, v2 +// VI: v_frexp_exp_i16_f16_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] +v_frexp_exp_i16_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_floor_f16 v1, v2 +// VI: v_floor_f16_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] +v_floor_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_ceil_f16 v1, v2 +// VI: v_ceil_f16_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] +v_ceil_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_trunc_f16 v1, v2 +// VI: v_trunc_f16_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] +v_trunc_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_rndne_f16 v1, v2 +// VI: v_rndne_f16_e32 v1, v2 ; encoding: [0x02,0x8f,0x02,0x7e] +v_rndne_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_fract_f16 v1, v2 +// VI: v_fract_f16_e32 v1, v2 ; encoding: [0x02,0x91,0x02,0x7e] +v_fract_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sin_f16 v1, v2 +// VI: v_sin_f16_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] +v_sin_f16 v1, v2 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_cos_f16 v1, v2 +// VI: 
v_cos_f16_e32 v1, v2 ; encoding: [0x02,0x95,0x02,0x7e] +v_cos_f16 v1, v2 diff --git a/test/MC/AMDGPU/vop2-err.s b/test/MC/AMDGPU/vop2-err.s new file mode 100644 index 00000000000..a1131000a90 --- /dev/null +++ b/test/MC/AMDGPU/vop2-err.s @@ -0,0 +1,35 @@ +// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s + +//===----------------------------------------------------------------------===// +// Generic checks +//===----------------------------------------------------------------------===// + +v_mul_i32_i24 v1, v2, 100 +// CHECK: error: invalid operand for instruction + +//===----------------------------------------------------------------------===// +// _e32 checks +//===----------------------------------------------------------------------===// + +// Immediate src1 +v_mul_i32_i24_e32 v1, v2, 100 +// CHECK: error: invalid operand for instruction + +// sgpr src1 +v_mul_i32_i24_e32 v1, v2, s3 +// CHECK: error: invalid operand for instruction + +//===----------------------------------------------------------------------===// +// _e64 checks +//===----------------------------------------------------------------------===// + +// Immediate src0 +v_mul_i32_i24_e64 v1, 100, v3 +// CHECK: error: invalid operand for instruction + +// Immediate src1 +v_mul_i32_i24_e64 v1, v2, 100 +// CHECK: error: invalid operand for instruction + +// TODO: Constant bus restrictions diff --git a/test/MC/AMDGPU/vop2.s b/test/MC/AMDGPU/vop2.s new file mode 100644 index 00000000000..a1f3b8d8936 --- /dev/null +++ b/test/MC/AMDGPU/vop2.s @@ -0,0 +1,421 @@ +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI + +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI + +//===----------------------------------------------------------------------===// +// Generic Checks for floating-point instructions (These have modifiers). 
+//===----------------------------------------------------------------------===// + +// TODO: 64-bit encoding of instructions with modifiers + +// _e32 suffix +// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] +v_add_f32_e32 v1, v2, v3 + +// src0 inline immediate +// SICI: v_add_f32_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x06] +v_add_f32 v1, 1.0, v3 + +// src0 negative inline immediate +// SICI: v_add_f32_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x06] +v_add_f32 v1, -1.0, v3 + +// src0 literal +// SICI: v_add_f32_e32 v1, 0x42c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0x42] +v_add_f32 v1, 100.0, v3 + +// src0 negative literal +// SICI: v_add_f32_e32 v1, 0xc2c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0xc2] +v_add_f32 v1, -100.0, v3 + +//===----------------------------------------------------------------------===// +// Generic Checks for integer instructions (These don't have modifiers). +//===----------------------------------------------------------------------===// + +// _e32 suffix +// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] +v_mul_i32_i24_e32 v1, v2, v3 + +// _e64 suffix +// SICI: v_mul_i32_i24_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x02,0x00] +v_mul_i32_i24_e64 v1, v2, v3 + +// src0 inline +// SICI: v_mul_i32_i24_e32 v1, 3, v3 ; encoding: [0x83,0x06,0x02,0x12] +v_mul_i32_i24 v1, 3, v3 + +// src0 negative inline +// SICI: v_mul_i32_i24_e32 v1, -3, v3 ; encoding: [0xc3,0x06,0x02,0x12] +v_mul_i32_i24 v1, -3, v3 + +// src1 inline +// SICI: v_mul_i32_i24_e64 v1, v2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x01,0x00] +v_mul_i32_i24 v1, v2, 3 + +// src1 negative inline +// SICI: v_mul_i32_i24_e64 v1, v2, -3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x87,0x01,0x00] +v_mul_i32_i24 v1, v2, -3 + +// src0 literal +// SICI: v_mul_i32_i24_e32 v1, 0x64, v3 ; encoding: [0xff,0x06,0x02,0x12,0x64,0x00,0x00,0x00] +v_mul_i32_i24 v1, 100, v3 + +// src1 negative literal +// SICI: v_mul_i32_i24_e32 v1, 0xffffff9c, v3 ; encoding: [0xff,0x06,0x02,0x12,0x9c,0xff,0xff,0xff] +v_mul_i32_i24 v1, -100, v3 + +//===----------------------------------------------------------------------===// +// Checks for legal operands +//===----------------------------------------------------------------------===// + +// src0 sgpr +// SICI: v_mul_i32_i24_e32 v1, s2, v3 ; encoding: [0x02,0x06,0x02,0x12] +v_mul_i32_i24 v1, s2, v3 + +// src1 sgpr +// SICI: v_mul_i32_i24_e64 v1, v2, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x00,0x00] +v_mul_i32_i24 v1, v2, s3 + +// src0, src1 same sgpr +// SICI: v_mul_i32_i24_e64 v1, s2, s2 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x04,0x00,0x00] +v_mul_i32_i24 v1, s2, s2 + +// src0 sgpr, src1 inline +// SICI: v_mul_i32_i24_e64 v1, s2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x06,0x01,0x00] +v_mul_i32_i24 v1, s2, 3 + +// src0 inline src1 sgpr +// SICI: v_mul_i32_i24_e64 v1, 3, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x83,0x06,0x00,0x00] +v_mul_i32_i24 v1, 3, s3 + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +// GCN: v_cndmask_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x00] +v_cndmask_b32 v1, v2, v3 + +// SICI: v_readlane_b32 s1, v2, s3 ; encoding: [0x02,0x07,0x02,0x02] +// VI: v_readlane_b32 s1, v2, s3 ; encoding: [0x01,0x00,0x89,0xd2,0x02,0x07,0x00,0x00] +v_readlane_b32 s1, v2, s3 + +// SICI: v_writelane_b32 v1, s2, s3 ; encoding: [0x02,0x06,0x02,0x04] +// VI: 
v_writelane_b32 v1, s2, s3 ; encoding: [0x01,0x00,0x8a,0xd2,0x02,0x06,0x00,0x00] +v_writelane_b32 v1, s2, s3 + +// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] +// VI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x02] +v_add_f32 v1, v2, v3 + +// SICI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] +// VI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x04] +v_sub_f32 v1, v2, v3 + +// SICI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] +// VI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] +v_subrev_f32 v1, v2, v3 + +// SICI: v_mac_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_mac_legacy_f32 v1, v2, v3 +v_mac_legacy_f32 v1, v2, v3 + +// SICI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] +// VI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] +v_mul_legacy_f32_e32 v1, v2, v3 + +// SICI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] +// VI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] +v_mul_f32 v1, v2, v3 + +// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] +// VI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] +v_mul_i32_i24 v1, v2, v3 + +// SICI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] +// VI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] +v_mul_hi_i32_i24 v1, v2, v3 + +// SICI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] +// VI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] +v_mul_u32_u24 v1, v2, v3 + +// SICI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] +// VI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] +v_mul_hi_u32_u24 v1, v2, v3 + +// SICI: v_min_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_min_legacy_f32_e32 v1, v2, v3 +v_min_legacy_f32_e32 v1, v2, v3 + +// SICI: v_max_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_max_legacy_f32 v1, v2, v3 +v_max_legacy_f32 v1, v2, v3 + +// SICI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] +// VI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] +v_min_f32_e32 v1, v2, v3 + +// SICI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] +// VI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] +v_max_f32 v1, v2 v3 + +// SICI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] +// VI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] +v_min_i32 v1, v2, v3 + +// SICI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] +// VI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] +v_max_i32 v1, v2, v3 + +// SICI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] +// VI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] +v_min_u32 v1, v2, v3 + +// SICI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] +// VI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] +v_max_u32 v1, v2, v3 + +// SICI: v_lshr_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_lshr_b32 v1, v2, v3 +v_lshr_b32 v1, v2, v3 + +// SICI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] +// VI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] 
+v_lshrrev_b32 v1, v2, v3 + +// SICI: v_ashr_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2e] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_ashr_i32 v1, v2, v3 +v_ashr_i32 v1, v2, v3 + +// SICI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x30] +// VI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] +v_ashrrev_i32 v1, v2, v3 + +// SICI: v_lshl_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] +// NOVI: error: instruction not supported on this GPU +// NOVI: v_lshl_b32_e32 v1, v2, v3 +v_lshl_b32_e32 v1, v2, v3 + +// SICI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] +// VI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] +v_lshlrev_b32 v1, v2, v3 + +// SICI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] +// VI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] +v_and_b32 v1, v2, v3 + +// SICI: v_or_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x38] +// VI: v_or_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] +v_or_b32 v1, v2, v3 + +// SICI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] +// VI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] +v_xor_b32 v1, v2, v3 + +// SICI: v_bfm_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] +// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00] +v_bfm_b32 v1, v2, v3 + +// SICI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] +// VI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] +v_mac_f32 v1, v2, v3 + +// SICI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x40,0x00,0x00,0x80,0x42] +// VI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42] +v_madmk_f32 v1, v2, v3, 64.0 + +// SICI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x42,0x00,0x00,0x80,0x42] +// VI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42] +v_madak_f32 v1, v2, v3, 64.0 + +// SICI: v_bcnt_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] +// VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00] +v_bcnt_u32_b32 v1, v2, v3 + +// SICI: v_mbcnt_lo_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] +// VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00] +v_mbcnt_lo_u32_b32 v1, v2, v3 + +// SICI: v_mbcnt_hi_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x48] +// VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00] +v_mbcnt_hi_u32_b32 v1, v2, v3 + +// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] +// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] +v_add_i32 v1, v2, v3 + +// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] +// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] +v_add_u32 v1, v2, v3 + +// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] +// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] +v_sub_i32 v1, v2, v3 + +// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] +// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] +v_sub_u32 v1, v2, v3 + +// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] +// VI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] +v_subrev_i32 v1, v2, v3 + +// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] +// VI: v_subrev_i32_e32 v1, 
v2, v3 ; encoding: [0x02,0x07,0x02,0x36] +v_subrev_u32 v1, v2, v3 + +// SICI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] +// VI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x38] +v_addc_u32 v1, v2, v3 + +// SICI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] +// VI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] +v_subb_u32 v1, v2, v3 + +// SICI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] +// VI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] +v_subbrev_u32 v1, v2, v3 + +// SICI: v_ldexp_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] +// VI: v_ldexp_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00] +v_ldexp_f32 v1, v2, v3 + +// SICI: v_cvt_pkaccum_u8_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58] +// VI: v_cvt_pkaccum_u8_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00] +v_cvt_pkaccum_u8_f32 v1, v2, v3 + +// SICI: v_cvt_pknorm_i16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a] +// VI: v_cvt_pknorm_i16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pknorm_i16_f32 v1, v2, v3 + +// SICI: v_cvt_pknorm_u16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c] +// VI: v_cvt_pknorm_u16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pknorm_u16_f32 v1, v2, v3 + +// SICI: v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e] +// VI: v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pkrtz_f16_f32 v1, v2, v3 + +// SICI: v_cvt_pk_u16_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60] +// VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pk_u16_u32 v1, v2, v3 + +// SICI: v_cvt_pk_i16_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62] +// VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00] +v_cvt_pk_i16_i32 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_add_f16 v1, v2, v3 +// VI: v_add_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] +v_add_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sub_f16 v1, v2, v3 +// VI: v_sub_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x40] +v_sub_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_subrev_f16 v1, v2, v3 +// VI: v_subrev_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x42] +v_subrev_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_mul_f16 v1, v2, v3 +// VI: v_mul_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] +v_mul_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_mac_f16 v1, v2, v3 +// VI: v_mac_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] +v_mac_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_madmk_f16 v1, v2, v3, 64.0 +// VI: v_madmk_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42] +v_madmk_f16 v1, v2, v3, 64.0 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_madak_f16 v1, v2, v3, 64.0 +// VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42] +v_madak_f16 v1, v2, v3, 64.0 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_add_u16 v1, v2, v3 +// VI: v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] +v_add_u16 v1, v2, 
v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_sub_u16 v1, v2, v3 +// VI: v_sub_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] +v_sub_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_subrev_u16 v1, v2, v3 +// VI: v_subrev_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] +v_subrev_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_mul_lo_u16 v1, v2, v3 +// VI: v_mul_lo_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] +v_mul_lo_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_lshlrev_b16 v1, v2, v3 +// VI: v_lshlrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] +v_lshlrev_b16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_lshrrev_b16 v1, v2, v3 +// VI: v_lshrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] +v_lshrrev_b16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_ashrrev_b16 v1, v2, v3 +// VI: v_ashrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58] +v_ashrrev_b16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_max_f16 v1, v2, v3 +// VI: v_max_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a] +v_max_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_min_f16 v1, v2, v3 +// VI: v_min_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c] +v_min_f16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_max_u16 v1, v2, v3 +// VI: v_max_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e] +v_max_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_max_i16 v1, v2, v3 +// VI: v_max_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60] +v_max_i16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_min_u16 v1, v2, v3 +// VI: v_min_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62] +v_min_u16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_min_i16 v1, v2, v3 +// VI: v_min_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x64] +v_min_i16 v1, v2, v3 + +// NOSICI: error: instruction not supported on this GPU +// NOSICI: v_ldexp_f16 v1, v2, v3 +// VI: v_ldexp_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x66] +v_ldexp_f16 v1, v2, v3 diff --git a/test/MC/AMDGPU/vop3-errs.s b/test/MC/AMDGPU/vop3-errs.s new file mode 100644 index 00000000000..b57fe6d5314 --- /dev/null +++ b/test/MC/AMDGPU/vop3-errs.s @@ -0,0 +1,5 @@ +// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s + +v_add_f32_e64 v0, v1 +// CHECK: error: too few operands for instruction diff --git a/test/MC/AMDGPU/vop3.s b/test/MC/AMDGPU/vop3.s new file mode 100644 index 00000000000..20562335974 --- /dev/null +++ b/test/MC/AMDGPU/vop3.s @@ -0,0 +1,149 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// VOPC Instructions +//===----------------------------------------------------------------------===// + +// Test forced e64 encoding + +v_cmp_lt_f32_e64 s[2:3], v4, -v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40] + +// +// Modifier tests: +// + +v_cmp_lt_f32 s[2:3] -v4, v6 +// 
CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x20] + +v_cmp_lt_f32 s[2:3] v4, -v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40] + +v_cmp_lt_f32 s[2:3] -v4, -v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x60] + +v_cmp_lt_f32 s[2:3] |v4|, v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3] v4, |v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, |v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3] |v4|, |v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, |v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3] -|v4|, v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x20] + +v_cmp_lt_f32 s[2:3] v4, -|v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -|v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x40] + +v_cmp_lt_f32 s[2:3] -|v4|, -|v6| +// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, -|v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x60] + +// +// Instruction tests: +// + +v_cmp_f_f32 s[2:3], v4, v6 +// CHECK: v_cmp_f_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x00,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lt_f32 s[2:3], v4, v6 +// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_eq_f32 s[2:3], v4, v6 +// CHECK: v_cmp_eq_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x04,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_le_f32 s[2:3], v4, v6 +// CHECK: v_cmp_le_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x06,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_gt_f32 s[2:3], v4, v6 +// CHECK: v_cmp_gt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x08,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_lg_f32 s[2:3], v4, v6 +// CHECK: v_cmp_lg_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0a,0xd0,0x04,0x0d,0x02,0x00] + +v_cmp_ge_f32 s[2:3], v4, v6 +// CHECK: v_cmp_ge_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0c,0xd0,0x04,0x0d,0x02,0x00] + +// TODO: Finish VOPC + +//===----------------------------------------------------------------------===// +// VOP1 Instructions +//===----------------------------------------------------------------------===// + +// +// Modifier tests: +// + +v_fract_f32 v1, -v2 +// CHECK: v_fract_f32_e64 v1, -v2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x20] + +v_fract_f32 v1, |v2| +// CHECK: v_fract_f32_e64 v1, |v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x00] + +v_fract_f32 v1, -|v2| +// CHECK: v_fract_f32_e64 v1, -|v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x20] + +v_fract_f32 v1, v2 clamp +// CHECK: v_fract_f32_e64 v1, v2 clamp ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x00] + +v_fract_f32 v1, v2 mul:2 +// CHECK: v_fract_f32_e64 v1, v2 mul:2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x08] + +v_fract_f32 v1, v2, div:2 clamp +// CHECK: v_fract_f32_e64 v1, v2 clamp div:2 ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x18] + +// TODO: Finish VOP1 + +///===---------------------------------------------------------------------===// +// VOP2 Instructions +///===---------------------------------------------------------------------===// + +// Test forced e64 encoding with e32 operands + +v_ldexp_f32_e64 v1, v3, v5 +// CHECK: v_ldexp_f32_e64 v1, v3, v5 ; encoding: [0x01,0x00,0x56,0xd2,0x03,0x0b,0x02,0x00] + + +// TODO: Modifier tests + +v_cndmask_b32 v1, v3, v5, s[4:5] +// CHECK: v_cndmask_b32_e64 v1, v3, v5, s[4:5] ; encoding: 
[0x01,0x00,0x00,0xd2,0x03,0x0b,0x12,0x00] + +//TODO: readlane, writelane + +v_add_f32 v1, v3, s5 +// CHECK: v_add_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x06,0xd2,0x03,0x0b,0x00,0x00] + +v_sub_f32 v1, v3, s5 +// CHECK: v_sub_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x08,0xd2,0x03,0x0b,0x00,0x00] + +v_subrev_f32 v1, v3, s5 +// CHECK: v_subrev_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0a,0xd2,0x03,0x0b,0x00,0x00] + +v_mac_legacy_f32 v1, v3, s5 +// CHECK: v_mac_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0c,0xd2,0x03,0x0b,0x00,0x00] + +v_mul_legacy_f32 v1, v3, s5 +// CHECK: v_mul_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0e,0xd2,0x03,0x0b,0x00,0x00] + +v_mul_f32 v1, v3, s5 +// CHECK: v_mul_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x10,0xd2,0x03,0x0b,0x00,0x00] + +v_mul_i32_i24 v1, v3, s5 +// CHECK: v_mul_i32_i24_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x12,0xd2,0x03,0x0b,0x00,0x00] + +///===---------------------------------------------------------------------===// +// VOP3 Instructions +///===---------------------------------------------------------------------===// + +// TODO: Modifier tests + +v_mad_legacy_f32 v2, v4, v6, v8 +// CHECK: v_mad_legacy_f32 v2, v4, v6, v8 ; encoding: [0x02,0x00,0x80,0xd2,0x04,0x0d,0x22,0x04] + + + + + diff --git a/test/MC/AMDGPU/vopc.s b/test/MC/AMDGPU/vopc.s new file mode 100644 index 00000000000..f44919a4f1e --- /dev/null +++ b/test/MC/AMDGPU/vopc.s @@ -0,0 +1,40 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s +// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s + +//===----------------------------------------------------------------------===// +// Generic Checks +//===----------------------------------------------------------------------===// + +// src0 sgpr +v_cmp_lt_f32 vcc, s2, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x02,0x7c] + +// src0 inline immediate +v_cmp_lt_f32 vcc, 0, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x02,0x7c] + +// src0 literal +v_cmp_lt_f32 vcc, 10.0, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x02,0x7c,0x00,0x00,0x20,0x41] + +// src0, src1 max vgpr +v_cmp_lt_f32 vcc, v255, v255 +// CHECK: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x03,0x7c] + +// force 32-bit encoding +v_cmp_lt_f32_e32 vcc, v2, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] + + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +v_cmp_f_f32 vcc, v2, v4 +// CHECK: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x00,0x7c] + +v_cmp_lt_f32 vcc, v2, v4 +// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] + +// TODO: Add tests for the rest of the instructions. + diff --git a/test/MC/R600/ds-err.s b/test/MC/R600/ds-err.s deleted file mode 100644 index 52c2740bec2..00000000000 --- a/test/MC/R600/ds-err.s +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s - -// offset too big -// CHECK: invalid operand for instruction -ds_add_u32 v2, v4 offset:1000000000 - -// offset0 twice -// CHECK: error: not a valid operand. -ds_write2_b32 v2, v4, v6 offset0:4 offset0:8 - -// offset1 twice -// CHECK: error: not a valid operand. 
-ds_write2_b32 v2, v4, v6 offset1:4 offset1:8 - -// offset0 too big -// CHECK: invalid operand for instruction -ds_write2_b32 v2, v4, v6 offset0:1000000000 - -// offset1 too big -// CHECK: invalid operand for instruction -ds_write2_b32 v2, v4, v6 offset1:1000000000 - diff --git a/test/MC/R600/ds.s b/test/MC/R600/ds.s deleted file mode 100644 index ad63229ba2e..00000000000 --- a/test/MC/R600/ds.s +++ /dev/null @@ -1,337 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Checks for 16-bit Offsets -//===----------------------------------------------------------------------===// - -ds_add_u32 v2, v4 offset:16 -// CHECK: ds_add_u32 v2, v4 offset:16 ; encoding: [0x10,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] - -//===----------------------------------------------------------------------===// -// Checks for 2 8-bit Offsets -//===----------------------------------------------------------------------===// - -ds_write2_b32 v2, v4, v6 offset0:4 -// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 ; encoding: [0x04,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 -// CHECK: ds_write2_b32 v2, v4, v6 offset0:4 offset1:8 ; encoding: [0x04,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_write2_b32 v2, v4, v6 offset1:8 -// CHECK: ds_write2_b32 v2, v4, v6 offset1:8 ; encoding: [0x00,0x08,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_read2_b32 v[8:9], v2 offset0:4 -// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 ; encoding: [0x04,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] - -ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 -// CHECK: ds_read2_b32 v[8:9], v2 offset0:4 offset1:8 ; encoding: [0x04,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] - -ds_read2_b32 v[8:9], v2 offset1:8 -// CHECK: ds_read2_b32 v[8:9], v2 offset1:8 ; encoding: [0x00,0x08,0xdc,0xd8,0x02,0x00,0x00,0x08] -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -ds_add_u32 v2, v4 -// CHECK: ds_add_u32 v2, v4 ; encoding: [0x00,0x00,0x00,0xd8,0x02,0x04,0x00,0x00] - -ds_sub_u32 v2, v4 -// CHECK: ds_sub_u32 v2, v4 ; encoding: [0x00,0x00,0x04,0xd8,0x02,0x04,0x00,0x00] - -ds_rsub_u32 v2, v4 -// CHECK: ds_rsub_u32 v2, v4 ; encoding: [0x00,0x00,0x08,0xd8,0x02,0x04,0x00,0x00] - -ds_inc_u32 v2, v4 -// CHECK: ds_inc_u32 v2, v4 ; encoding: [0x00,0x00,0x0c,0xd8,0x02,0x04,0x00,0x00] - -ds_dec_u32 v2, v4 -// CHECK: ds_dec_u32 v2, v4 ; encoding: [0x00,0x00,0x10,0xd8,0x02,0x04,0x00,0x00] - -ds_min_i32 v2, v4 -// CHECK: ds_min_i32 v2, v4 ; encoding: [0x00,0x00,0x14,0xd8,0x02,0x04,0x00,0x00] - -ds_max_i32 v2, v4 -// CHECK: ds_max_i32 v2, v4 ; encoding: [0x00,0x00,0x18,0xd8,0x02,0x04,0x00,0x00] - -ds_min_u32 v2, v4 -// CHECK: ds_min_u32 v2, v4 ; encoding: [0x00,0x00,0x1c,0xd8,0x02,0x04,0x00,0x00] - -ds_max_u32 v2, v4 -// CHECK: ds_max_u32 v2, v4 ; encoding: [0x00,0x00,0x20,0xd8,0x02,0x04,0x00,0x00] - -ds_and_b32 v2, v4 -// CHECK: ds_and_b32 v2, v4 ; encoding: [0x00,0x00,0x24,0xd8,0x02,0x04,0x00,0x00] - -ds_or_b32 v2, v4 -// CHECK: ds_or_b32 v2, v4 ; encoding: [0x00,0x00,0x28,0xd8,0x02,0x04,0x00,0x00] - -ds_xor_b32 v2, v4 -// CHECK: ds_xor_b32 v2, v4 ; encoding: [0x00,0x00,0x2c,0xd8,0x02,0x04,0x00,0x00] - -ds_mskor_b32 v2, v4, v6 -// CHECK: ds_mskor_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x04,0x06,0x00] - -ds_write_b32 v2, v4 -// CHECK: ds_write_b32 v2, v4 ; 
encoding: [0x00,0x00,0x34,0xd8,0x02,0x04,0x00,0x00] - -ds_write2_b32 v2, v4, v6 -// CHECK: ds_write2_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x38,0xd8,0x02,0x04,0x06,0x00] - -ds_write2st64_b32 v2, v4, v6 -// CHECK: ds_write2st64_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x3c,0xd8,0x02,0x04,0x06,0x00] - -ds_cmpst_b32 v2, v4, v6 -// CHECK: ds_cmpst_b32 v2, v4, v6 ; encoding: [0x00,0x00,0x40,0xd8,0x02,0x04,0x06,0x00] - -ds_cmpst_f32 v2, v4, v6 -// CHECK: ds_cmpst_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x44,0xd8,0x02,0x04,0x06,0x00] - -ds_min_f32 v2, v4, v6 -// CHECK: ds_min_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x48,0xd8,0x02,0x04,0x06,0x00] - -ds_max_f32 v2, v4, v6 -// CHECK: ds_max_f32 v2, v4, v6 ; encoding: [0x00,0x00,0x4c,0xd8,0x02,0x04,0x06,0x00] - -ds_gws_init v2 gds -// CHECK: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_sema_v v2 gds -// CHECK: ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x6a,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_sema_br v2 gds -// CHECK: ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x6e,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_sema_p v2 gds -// CHECK: ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x72,0xd8,0x02,0x00,0x00,0x00] - -ds_gws_barrier v2 gds -// CHECK: ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x76,0xd8,0x02,0x00,0x00,0x00] - -ds_write_b8 v2, v4 -// CHECK: ds_write_b8 v2, v4 ; encoding: [0x00,0x00,0x78,0xd8,0x02,0x04,0x00,0x00] - -ds_write_b16 v2, v4 -// CHECK: ds_write_b16 v2, v4 ; encoding: [0x00,0x00,0x7c,0xd8,0x02,0x04,0x00,0x00] - -ds_add_rtn_u32 v8, v2, v4 -// CHECK: ds_add_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x80,0xd8,0x02,0x04,0x00,0x08] - -ds_sub_rtn_u32 v8, v2, v4 -// CHECK: ds_sub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x84,0xd8,0x02,0x04,0x00,0x08] - -ds_rsub_rtn_u32 v8, v2, v4 -// CHECK: ds_rsub_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x88,0xd8,0x02,0x04,0x00,0x08] - -ds_inc_rtn_u32 v8, v2, v4 -// CHECK: ds_inc_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x8c,0xd8,0x02,0x04,0x00,0x08] - -ds_dec_rtn_u32 v8, v2, v4 -// CHECK: ds_dec_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x90,0xd8,0x02,0x04,0x00,0x08] - -ds_min_rtn_i32 v8, v2, v4 -// CHECK: ds_min_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x94,0xd8,0x02,0x04,0x00,0x08] - -ds_max_rtn_i32 v8, v2, v4 -// CHECK: ds_max_rtn_i32 v8, v2, v4 ; encoding: [0x00,0x00,0x98,0xd8,0x02,0x04,0x00,0x08] - -ds_min_rtn_u32 v8, v2, v4 -// CHECK: ds_min_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0x9c,0xd8,0x02,0x04,0x00,0x08] - -ds_max_rtn_u32 v8, v2, v4 -// CHECK: ds_max_rtn_u32 v8, v2, v4 ; encoding: [0x00,0x00,0xa0,0xd8,0x02,0x04,0x00,0x08] - -ds_and_rtn_b32 v8, v2, v4 -// CHECK: ds_and_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa4,0xd8,0x02,0x04,0x00,0x08] - -ds_or_rtn_b32 v8, v2, v4 -// CHECK: ds_or_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xa8,0xd8,0x02,0x04,0x00,0x08] - -ds_xor_rtn_b32 v8, v2, v4 -// CHECK: ds_xor_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xac,0xd8,0x02,0x04,0x00,0x08] - -ds_mskor_rtn_b32 v8, v2, v4, v6 -// CHECK: ds_mskor_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xb0,0xd8,0x02,0x04,0x06,0x08] - -ds_wrxchg_rtn_b32 v8, v2, v4 -// CHECK: ds_wrxchg_rtn_b32 v8, v2, v4 ; encoding: [0x00,0x00,0xb4,0xd8,0x02,0x04,0x00,0x08] - -ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 -// CHECK: ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xb8,0xd8,0x02,0x04,0x06,0x08] - -ds_wrxchg2st64_rtn_b32 v[8:9] v2, v4, v6 -// CHECK: ds_wrxchg2st64_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xbc,0xd8,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_b32 v8, v2, v4, v6 -// CHECK: 
ds_cmpst_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc0,0xd8,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_f32 v8, v2, v4, v6 -// CHECK: ds_cmpst_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc4,0xd8,0x02,0x04,0x06,0x08] - -ds_min_rtn_f32 v8, v2, v4, v6 -// CHECK: ds_min_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc8,0xd8,0x02,0x04,0x06,0x08] - -ds_max_rtn_f32 v8, v2, v4, v6 -// CHECK: ds_max_rtn_f32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xcc,0xd8,0x02,0x04,0x06,0x08] - -ds_swizzle_b32 v8, v2 -// CHECK: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] - -ds_read_b32 v8, v2 -// CHECK: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0xd8,0xd8,0x02,0x00,0x00,0x08] - -ds_read2_b32 v[8:9], v2 -// CHECK: ds_read2_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xdc,0xd8,0x02,0x00,0x00,0x08] - -ds_read2st64_b32 v[8:9], v2 -// CHECK: ds_read2st64_b32 v[8:9], v2 ; encoding: [0x00,0x00,0xe0,0xd8,0x02,0x00,0x00,0x08] - -ds_read_i8 v8, v2 -// CHECK: ds_read_i8 v8, v2 ; encoding: [0x00,0x00,0xe4,0xd8,0x02,0x00,0x00,0x08] - -ds_read_u8 v8, v2 -// CHECK: ds_read_u8 v8, v2 ; encoding: [0x00,0x00,0xe8,0xd8,0x02,0x00,0x00,0x08] - -ds_read_i16 v8, v2 -// CHECK: ds_read_i16 v8, v2 ; encoding: [0x00,0x00,0xec,0xd8,0x02,0x00,0x00,0x08] - -ds_read_u16 v8, v2 -// CHECK: ds_read_u16 v8, v2 ; encoding: [0x00,0x00,0xf0,0xd8,0x02,0x00,0x00,0x08] - -ds_consume v8 -// CHECK: ds_consume v8 ; encoding: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x08] - -ds_append v8 -// CHECK: ds_append v8 ; encoding: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x08] - -ds_ordered_count v8, v2 gds -// CHECK: ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0xfe,0xd8,0x02,0x00,0x00,0x08] - -ds_add_u64 v2, v[4:5] -// CHECK: ds_add_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x00,0xd9,0x02,0x04,0x00,0x00] - -ds_sub_u64 v2, v[4:5] -// CHECK: ds_sub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x04,0xd9,0x02,0x04,0x00,0x00] - -ds_rsub_u64 v2, v[4:5] -// CHECK: ds_rsub_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x08,0xd9,0x02,0x04,0x00,0x00] - -ds_inc_u64 v2, v[4:5] -// CHECK: ds_inc_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x0c,0xd9,0x02,0x04,0x00,0x00] - -ds_dec_u64 v2, v[4:5] -// CHECK: ds_dec_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x10,0xd9,0x02,0x04,0x00,0x00] - -ds_min_i64 v2, v[4:5] -// CHECK: ds_min_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x14,0xd9,0x02,0x04,0x00,0x00] - -ds_max_i64 v2, v[4:5] -// CHECK: ds_max_i64 v2, v[4:5] ; encoding: [0x00,0x00,0x18,0xd9,0x02,0x04,0x00,0x00] - -ds_min_u64 v2, v[4:5] -// CHECK: ds_min_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x1c,0xd9,0x02,0x04,0x00,0x00] - -ds_max_u64 v2, v[4:5] -// CHECK: ds_max_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x20,0xd9,0x02,0x04,0x00,0x00] - -ds_and_b64 v2, v[4:5] -// CHECK: ds_and_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x24,0xd9,0x02,0x04,0x00,0x00] - -ds_or_b64 v2, v[4:5] -// CHECK: ds_or_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x28,0xd9,0x02,0x04,0x00,0x00] - -ds_xor_b64 v2, v[4:5] -// CHECK: ds_xor_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x2c,0xd9,0x02,0x04,0x00,0x00] - -ds_mskor_b64 v2, v[4:5], v[6:7] -// CHECK: ds_mskor_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x30,0xd9,0x02,0x04,0x06,0x00] - -ds_write_b64 v2, v[4:5] -// CHECK: ds_write_b64 v2, v[4:5] ; encoding: [0x00,0x00,0x34,0xd9,0x02,0x04,0x00,0x00] - -ds_write2_b64 v2, v[4:5], v[6:7] -// CHECK: ds_write2_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x38,0xd9,0x02,0x04,0x06,0x00] - -ds_write2st64_b64 v2, v[4:5], v[6:7] -// CHECK: ds_write2st64_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x3c,0xd9,0x02,0x04,0x06,0x00] - -ds_cmpst_b64 v2, 
v[4:5], v[6:7] -// CHECK: ds_cmpst_b64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x40,0xd9,0x02,0x04,0x06,0x00] - -ds_cmpst_f64 v2, v[4:5], v[6:7] -// CHECK: ds_cmpst_f64 v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0x44,0xd9,0x02,0x04,0x06,0x00] - -ds_min_f64 v2, v[4:5] -// CHECK: ds_min_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x48,0xd9,0x02,0x04,0x00,0x00] - -ds_max_f64 v2, v[4:5] -// CHECK: ds_max_f64 v2, v[4:5] ; encoding: [0x00,0x00,0x4c,0xd9,0x02,0x04,0x00,0x00] - -ds_add_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_add_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x80,0xd9,0x02,0x04,0x00,0x08] - -ds_sub_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_sub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x84,0xd9,0x02,0x04,0x00,0x08] - -ds_rsub_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_rsub_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x88,0xd9,0x02,0x04,0x00,0x08] - -ds_inc_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_inc_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x8c,0xd9,0x02,0x04,0x00,0x08] - -ds_dec_rtn_u64 v[8:9] v2, v[4:5] -// CHECK: ds_dec_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x90,0xd9,0x02,0x04,0x00,0x08] - -ds_min_rtn_i64 v[8:9], v2, v[4:5] -// CHECK: ds_min_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x94,0xd9,0x02,0x04,0x00,0x08] - -ds_max_rtn_i64 v[8:9], v2, v[4:5] -// CHECK: ds_max_rtn_i64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x98,0xd9,0x02,0x04,0x00,0x08] - -ds_min_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_min_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0x9c,0xd9,0x02,0x04,0x00,0x08] - -ds_max_rtn_u64 v[8:9], v2, v[4:5] -// CHECK: ds_max_rtn_u64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa0,0xd9,0x02,0x04,0x00,0x08] - -ds_and_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_and_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa4,0xd9,0x02,0x04,0x00,0x08] - -ds_or_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_or_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xa8,0xd9,0x02,0x04,0x00,0x08] - -ds_xor_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_xor_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xac,0xd9,0x02,0x04,0x00,0x08] - -ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] -// CHECK: ds_mskor_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb0,0xd9,0x02,0x04,0x06,0x08] - -ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] -// CHECK: ds_wrxchg_rtn_b64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xb4,0xd9,0x02,0x04,0x00,0x08] - -ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] -// CHECK: ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb8,0xd9,0x02,0x04,0x06,0x08] - -ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] -// CHECK: ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xbc,0xd9,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] -// CHECK: ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc0,0xd9,0x02,0x04,0x06,0x08] - -ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] -// CHECK: ds_cmpst_rtn_f64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc4,0xd9,0x02,0x04,0x06,0x08] - -ds_min_rtn_f64 v[8:9], v2, v[4:5] -// CHECK: ds_min_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xc8,0xd9,0x02,0x04,0x00,0x08] - -ds_max_rtn_f64 v[8:9], v2, v[4:5] -// CHECK: ds_max_rtn_f64 v[8:9], v2, v[4:5] ; encoding: [0x00,0x00,0xcc,0xd9,0x02,0x04,0x00,0x08] - -ds_read_b64 v[8:9], v2 -// CHECK: ds_read_b64 v[8:9], v2 ; encoding: [0x00,0x00,0xd8,0xd9,0x02,0x00,0x00,0x08] - -ds_read2_b64 v[8:11], v2 -// CHECK: ds_read2_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xdc,0xd9,0x02,0x00,0x00,0x08] - 
-ds_read2st64_b64 v[8:11], v2 -// CHECK: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xe0,0xd9,0x02,0x00,0x00,0x08] diff --git a/test/MC/R600/flat.s b/test/MC/R600/flat.s deleted file mode 100644 index adad29a5595..00000000000 --- a/test/MC/R600/flat.s +++ /dev/null @@ -1,477 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=CIVI --check-prefix=CI -// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=CIVI - -// FIXME: These instructions give an 'invalid operand' error on SI and should -// instead be reporting an 'instruction not supported' error. - -// XUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=NOVI -// XUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI -// XUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI - -//===----------------------------------------------------------------------===// -// Operands -//===----------------------------------------------------------------------===// - -flat_load_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] glc slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] glc tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] slc glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] slc tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc tfe 
; encoding: [0x00,0x00,0x31,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x32,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_load_dword v1, v[3:4] tfe slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x33,0xdc,0x03,0x00,0x80,0x01] - -flat_store_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] glc slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] glc tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] slc ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] slc glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] slc tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] tfe ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc tfe ; encoding: [0x00,0x00,0x71,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] slc tfe ; encoding: [0x00,0x00,0x72,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: 
[0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -flat_store_dword v1, v[3:4] tfe slc glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] glc slc tfe ; encoding: [0x00,0x00,0x73,0xdc,0x03,0x01,0x80,0x00] - -// FIXME: For atomic instructions, glc must be placed immediately following -// the data regiser. These forms aren't currently supported: -// flat_atomic_add v1, v[3:4], v5 slc glc -// flat_atomic_add v1, v[3:4], v5 slc glc tfe -// flat_atomic_add v1, v[3:4], v5 slc tfe glc -// flat_atomic_add v1, v[3:4], v5 tfe glc -// flat_atomic_add v[3:4], v5 tfe glc -// flat_atomic_add v1, v[3:4], v5 tfe glc slc -// flat_atomic_add v1, v[3:4], v5 tfe slc glc - -flat_atomic_add v1 v[3:4], v5 glc slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_add v1 v[3:4], v5 glc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc tfe ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x80,0x01] - -flat_atomic_add v1 v[3:4], v5 glc slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] - -flat_atomic_add v1 v[3:4], v5 glc tfe slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc slc tfe ; encoding: [0x00,0x00,0xcb,0xdc,0x03,0x05,0x80,0x01] - -flat_atomic_add v[3:4], v5 slc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 slc ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_add v[3:4], v5 slc tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 slc tfe ; encoding: [0x00,0x00,0xca,0xdc,0x03,0x05,0x80,0x00] - -flat_atomic_add v[3:4], v5 tfe -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 tfe ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x80,0x00] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -flat_load_ubyte v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_ubyte v1, v[3:4] ; encoding: [0x00,0x00,0x20,0xdc,0x03,0x00,0x00,0x01] - -flat_load_sbyte v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_sbyte v1, v[3:4] ; encoding: [0x00,0x00,0x24,0xdc,0x03,0x00,0x00,0x01] - -flat_load_ushort v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_ushort v1, v[3:4] ; encoding: [0x00,0x00,0x28,0xdc,0x03,0x00,0x00,0x01] - -flat_load_sshort v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_sshort v1, v[3:4] ; encoding: [0x00,0x00,0x2c,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dword v1, v[3:4] ; encoding: [0x00,0x00,0x30,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dwordx2 v[1:2], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x34,0xdc,0x03,0x00,0x00,0x01] - -flat_load_dwordx4 v[5:8], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x38,0xdc,0x03,0x00,0x00,0x05] - -flat_load_dwordx3 v[5:7], v[3:4] -// 
NOSI: error: instruction not supported on this GPU -// CIVI: flat_load_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x3c,0xdc,0x03,0x00,0x00,0x05] - -flat_store_byte v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_byte v1, v[3:4] ; encoding: [0x00,0x00,0x60,0xdc,0x03,0x01,0x00,0x00] - -flat_store_short v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_short v1, v[3:4] ; encoding: [0x00,0x00,0x68,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dword v1, v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dword v1, v[3:4] ; encoding: [0x00,0x00,0x70,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dwordx2 v[1:2], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dwordx2 v[1:2], v[3:4] ; encoding: [0x00,0x00,0x74,0xdc,0x03,0x01,0x00,0x00] - -flat_store_dwordx4 v[5:8], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dwordx4 v[5:8], v[3:4] ; encoding: [0x00,0x00,0x78,0xdc,0x03,0x05,0x00,0x00] - -flat_store_dwordx3 v[5:7], v[3:4] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_store_dwordx3 v[5:7], v[3:4] ; encoding: [0x00,0x00,0x7c,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_swap v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap v[3:4], v5 ; encoding: [0x00,0x00,0xc0,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_swap v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc1,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_cmpswap v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap v[3:4], v[5:6] ; encoding: [0x00,0x00,0xc4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_cmpswap v1, v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap v1, v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0xc5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_add v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v[3:4], v5 ; encoding: [0x00,0x00,0xc8,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_add v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xc9,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_sub v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub v[3:4], v5 ; encoding: [0x00,0x00,0xcc,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_sub v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xcd,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_smin v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin v[3:4], v5 ; encoding: [0x00,0x00,0xd4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_smin v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_umin v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin v[3:4], v5 ; encoding: [0x00,0x00,0xd8,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_umin v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xd9,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_smax v[3:4], v5 -// NOSI: error: 
instruction not supported on this GPU -// CIVI: flat_atomic_smax v[3:4], v5 ; encoding: [0x00,0x00,0xdc,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_smax v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xdd,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_umax v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax v[3:4], v5 ; encoding: [0x00,0x00,0xe0,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_umax v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe1,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_and v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and v[3:4], v5 ; encoding: [0x00,0x00,0xe4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_and v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_or v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or v[3:4], v5 ; encoding: [0x00,0x00,0xe8,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_or v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xe9,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_xor v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor v[3:4], v5 ; encoding: [0x00,0x00,0xec,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_xor v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xed,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_inc v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_inc v[3:4], v5 ; encoding: [0x00,0x00,0xf0,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_inc v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_inc v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf1,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_dec v[3:4], v5 -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec v[3:4], v5 ; encoding: [0x00,0x00,0xf4,0xdc,0x03,0x05,0x00,0x00] - -flat_atomic_dec v1, v[3:4], v5 glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec v1, v[3:4], v5 glc ; encoding: [0x00,0x00,0xf5,0xdc,0x03,0x05,0x00,0x01] - -flat_atomic_swap_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x40,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_swap_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_swap_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x41,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_cmpswap_x2 v[3:4], v[5:8] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x44,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_cmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x45,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_add_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x48,0xdd,0x03,0x05,0x00,0x00] - 
-flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_add_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x49,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_sub_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x4c,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_sub_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x4d,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_smin_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x54,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x55,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_umin_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x58,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x59,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_smax_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x5c,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_smax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x5d,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_umax_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x60,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_umax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x61,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_and_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x64,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_and_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x65,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_or_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x68,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_or_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_or_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x69,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_xor_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x6c,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_xor_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x6d,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_inc_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// 
CIVI: flat_atomic_inc_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x70,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_inc_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x71,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_dec_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x74,0xdd,0x03,0x05,0x00,0x00] - -flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CIVI: flat_atomic_dec_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x75,0xdd,0x03,0x05,0x00,0x01] - -flat_atomic_fcmpswap_x2 v[3:4], v[5:8] -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fcmpswap_x2 v[3:4], v[5:8] ; encoding: [0x00,0x00,0x78,0xdd,0x03,0x05,0x00,0x00] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fcmpswap_x2 v[1:2], v[3:4], v[5:8] glc ; encoding: [0x00,0x00,0x79,0xdd,0x03,0x05,0x00,0x01] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmin_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmin_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x7c,0xdd,0x03,0x05,0x00,0x00] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmin_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x7d,0xdd,0x03,0x05,0x00,0x01] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmax_x2 v[3:4], v[5:6] -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmax_x2 v[3:4], v[5:6] ; encoding: [0x00,0x00,0x80,0xdd,0x03,0x05,0x00,0x00] -// NOVI: error: instruction not supported on this GPU - -flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc -// NOSI: error: instruction not supported on this GPU -// CI: flat_atomic_fmax_x2 v[1:2], v[3:4], v[5:6] glc ; encoding: [0x00,0x00,0x81,0xdd,0x03,0x05,0x00,0x01] -// NOVI: error: instruction not supported on this GPU diff --git a/test/MC/R600/lit.local.cfg b/test/MC/R600/lit.local.cfg deleted file mode 100644 index ad9ce2541ef..00000000000 --- a/test/MC/R600/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if not 'R600' in config.root.targets: - config.unsupported = True diff --git a/test/MC/R600/mubuf.s b/test/MC/R600/mubuf.s deleted file mode 100644 index 78d365abef1..00000000000 --- a/test/MC/R600/mubuf.s +++ /dev/null @@ -1,352 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Test for different operand combinations -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// load - immediate offset only -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, s[4:7], s1 -// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, 
s[4:7], s1 offset:4 glc -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 slc -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x41,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 tfe -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x30,0xe0,0x00,0x01,0x81,0x01] - -buffer_load_dword v1, s[4:7], s1 tfe glc -// CHECK: buffer_load_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x30,0xe0,0x00,0x01,0x81,0x01] - -buffer_load_dword v1, s[4:7], s1 offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] - -buffer_load_dword v1, s[4:7], s1 glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x30,0xe0,0x00,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - vgpr offset -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v2, s[4:7], s1 offen -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen tfe glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - vgpr index -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v2, s[4:7], s1 idxen -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 tfe -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 
idxen offset:4 tfe ; encoding: [0x04,0x20,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen tfe glc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - vgpr index and offset -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// load - addr64 -//===----------------------------------------------------------------------===// - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0x01,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x41,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], 
s[4:7], s1 addr64 tfe glc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x30,0xe0,0x02,0x01,0x81,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] - -buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 -// CHECK: buffer_load_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x30,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - immediate offset only -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, s[4:7], s1 -// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 glc -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 slc -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 slc ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x41,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 tfe -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 tfe ; encoding: [0x04,0x00,0x70,0xe0,0x00,0x01,0x81,0x01] - -buffer_store_dword v1, s[4:7], s1 tfe glc -// CHECK: buffer_store_dword v1, s[4:7], s1 glc tfe ; encoding: [0x00,0x40,0x70,0xe0,0x00,0x01,0x81,0x01] - -buffer_store_dword v1, s[4:7], s1 offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] - -buffer_store_dword v1, s[4:7], s1 glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, s[4:7], s1 offset:4 glc slc tfe ; encoding: [0x04,0x40,0x70,0xe0,0x00,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - vgpr offset -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v2, s[4:7], s1 offen -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen ; encoding: [0x00,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 slc ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 tfe ; encoding: [0x04,0x10,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen tfe glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe ; encoding: [0x00,0x50,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 offen glc tfe slc offset:4 -// 
CHECK: buffer_store_dword v1, v2, s[4:7], s1 offen offset:4 glc slc tfe ; encoding: [0x04,0x50,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - vgpr index -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v2, s[4:7], s1 idxen -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen ; encoding: [0x00,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 slc ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 tfe ; encoding: [0x04,0x20,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen tfe glc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe ; encoding: [0x00,0x60,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v2, s[4:7], s1 idxen glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, v2, s[4:7], s1 idxen offset:4 glc slc tfe ; encoding: [0x04,0x60,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - vgpr index and offset -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen ; encoding: [0x00,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 slc ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 tfe ; encoding: [0x04,0x30,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen tfe glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe ; encoding: [0x00,0x70,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: [0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 idxen offen offset:4 glc slc tfe ; encoding: 
[0x04,0x70,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// store - addr64 -//===----------------------------------------------------------------------===// - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 ; encoding: [0x00,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0x01,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 slc ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x41,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 tfe ; encoding: [0x04,0x80,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 tfe glc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe ; encoding: [0x00,0xc0,0x70,0xe0,0x02,0x01,0x81,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc tfe slc -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] - -buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 glc tfe slc offset:4 -// CHECK: buffer_store_dword v1, v[2:3], s[4:7], s1 addr64 offset:4 glc slc tfe ; encoding: [0x04,0xc0,0x70,0xe0,0x02,0x01,0xc1,0x01] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -buffer_load_format_x v1, s[4:7], s1 -// CHECK: buffer_load_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_format_xy v[1:2], s[4:7], s1 -// CHECK: buffer_load_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_format_xyz v[1:3], s[4:7], s1 -// CHECK: buffer_load_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_format_xyzw v[1:4], s[4:7], s1 -// CHECK: buffer_load_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_x v1, s[4:7], s1 -// CHECK: buffer_store_format_x v1, s[4:7], s1 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_xy v[1:2], s[4:7], s1 -// CHECK: buffer_store_format_xy v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_xyz v[1:3], s[4:7], s1 -// CHECK: buffer_store_format_xyz v[1:3], s[4:7], s1 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_format_xyzw v[1:4], s[4:7], s1 -// CHECK: buffer_store_format_xyzw v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_ubyte v1, s[4:7], s1 -// CHECK: buffer_load_ubyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_sbyte v1, s[4:7], s1 -// CHECK: buffer_load_sbyte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_ushort v1, s[4:7], s1 -// CHECK: buffer_load_ushort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x01,0x01,0x01] - 
-buffer_load_sshort v1, s[4:7], s1 -// CHECK: buffer_load_sshort v1, s[4:7], s1 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dword v1, s[4:7], s1 -// CHECK: buffer_load_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dwordx2 v[1:2], s[4:7], s1 -// CHECK: buffer_load_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x01,0x01] - -buffer_load_dwordx4 v[1:4], s[4:7], s1 -// CHECK: buffer_load_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_byte v1, s[4:7], s1 -// CHECK: buffer_store_byte v1, s[4:7], s1 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_short v1, s[4:7], s1 -// CHECK: buffer_store_short v1, s[4:7], s1 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dword v1 s[4:7], s1 -// CHECK: buffer_store_dword v1, s[4:7], s1 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dwordx2 v[1:2], s[4:7], s1 -// CHECK: buffer_store_dwordx2 v[1:2], s[4:7], s1 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x01,0x01] - -buffer_store_dwordx4 v[1:4], s[4:7], s1 -// CHECK: buffer_store_dwordx4 v[1:4], s[4:7], s1 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x01,0x01,0x01] - -// TODO: Atomics diff --git a/test/MC/R600/smrd.s b/test/MC/R600/smrd.s deleted file mode 100644 index b67abf7e689..00000000000 --- a/test/MC/R600/smrd.s +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -s_load_dword s1, s[2:3], 1 -// CHECK: s_load_dword s1, s[2:3], 0x1 ; encoding: [0x01,0x83,0x00,0xc0] - -s_load_dword s1, s[2:3], s4 -// CHECK: s_load_dword s1, s[2:3], s4 ; encoding: [0x04,0x82,0x00,0xc0] - -s_load_dwordx2 s[2:3], s[2:3], 1 -// CHECK: s_load_dwordx2 s[2:3], s[2:3], 0x1 ; encoding: [0x01,0x03,0x41,0xc0] - -s_load_dwordx2 s[2:3], s[2:3], s4 -// CHECK: s_load_dwordx2 s[2:3], s[2:3], s4 ; encoding: [0x04,0x02,0x41,0xc0] - -s_load_dwordx4 s[4:7], s[2:3], 1 -// CHECK: s_load_dwordx4 s[4:7], s[2:3], 0x1 ; encoding: [0x01,0x03,0x82,0xc0] - -s_load_dwordx4 s[4:7], s[2:3], s4 -// CHECK: s_load_dwordx4 s[4:7], s[2:3], s4 ; encoding: [0x04,0x02,0x82,0xc0] - -s_load_dwordx8 s[8:15], s[2:3], 1 -// CHECK: s_load_dwordx8 s[8:15], s[2:3], 0x1 ; encoding: [0x01,0x03,0xc4,0xc0] - -s_load_dwordx8 s[8:15], s[2:3], s4 -// CHECK: s_load_dwordx8 s[8:15], s[2:3], s4 ; encoding: [0x04,0x02,0xc4,0xc0] - -s_load_dwordx16 s[16:31], s[2:3], 1 -// CHECK: s_load_dwordx16 s[16:31], s[2:3], 0x1 ; encoding: [0x01,0x03,0x08,0xc1] - -s_load_dwordx16 s[16:31], s[2:3], s4 -// CHECK: s_load_dwordx16 s[16:31], s[2:3], s4 ; encoding: [0x04,0x02,0x08,0xc1] diff --git a/test/MC/R600/sop1-err.s b/test/MC/R600/sop1-err.s deleted file mode 100644 index f892356b623..00000000000 --- a/test/MC/R600/sop1-err.s +++ /dev/null @@ -1,37 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s - -s_mov_b32 v1, s2 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s1, v0 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s[1:2], s0 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s0, s[1:2] -// CHECK: error: invalid operand for instruction - -s_mov_b32 s220, s0 -// CHECK: error: invalid operand for instruction - -s_mov_b32 s0, s220 -// CHECK: error: invalid operand for instruction - -s_mov_b64 s1, s[0:1] -// CHECK: error: invalid operand for instruction - 
-s_mov_b64 s[0:1], s1 -// CHECK: error: invalid operand for instruction - -// Immediate greater than 32-bits -s_mov_b32 s1, 0xfffffffff -// CHECK: error: invalid immediate: only 32-bit values are legal - -// Immediate greater than 32-bits -s_mov_b64 s[0:1], 0xfffffffff -// CHECK: error: invalid immediate: only 32-bit values are legal - -// Out of range register -s_mov_b32 s diff --git a/test/MC/R600/sop1.s b/test/MC/R600/sop1.s deleted file mode 100644 index 92ca73f2500..00000000000 --- a/test/MC/R600/sop1.s +++ /dev/null @@ -1,177 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -s_mov_b32 s1, s2 -// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] - -s_mov_b32 s1, 1 -// CHECK: s_mov_b32 s1, 1 ; encoding: [0x81,0x03,0x81,0xbe] - -s_mov_b32 s1, 100 -// CHECK: s_mov_b32 s1, 0x64 ; encoding: [0xff,0x03,0x81,0xbe,0x64,0x00,0x00,0x00] - -s_mov_b64 s[2:3], s[4:5] -// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] - -s_mov_b64 s[2:3], 0xffffffffffffffff -// CHECK: s_mov_b64 s[2:3], -1 ; encoding: [0xc1,0x04,0x82,0xbe] - -s_cmov_b32 s1, 200 -// CHECK: s_cmov_b32 s1, 0xc8 ; encoding: [0xff,0x05,0x81,0xbe,0xc8,0x00,0x00,0x00] - -s_cmov_b32 s1, 1.0 -// CHECK: s_cmov_b32 s1, 1.0 ; encoding: [0xf2,0x05,0x81,0xbe] - -//s_cmov_b64 s[2:3], 1.0 -//CHECK-FIXME: s_cmov_b64 s[2:3], 1.0 ; encoding: [0xf2,0x05,0x82,0xb3] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -s_mov_b32 s1, s2 -// CHECK: s_mov_b32 s1, s2 ; encoding: [0x02,0x03,0x81,0xbe] - -s_mov_b64 s[2:3], s[4:5] -// CHECK: s_mov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x04,0x82,0xbe] - -s_cmov_b32 s1, s2 -// CHECK: s_cmov_b32 s1, s2 ; encoding: [0x02,0x05,0x81,0xbe] - -s_cmov_b64 s[2:3], s[4:5] -// CHECK: s_cmov_b64 s[2:3], s[4:5] ; encoding: [0x04,0x06,0x82,0xbe] - -s_not_b32 s1, s2 -// CHECK: s_not_b32 s1, s2 ; encoding: [0x02,0x07,0x81,0xbe] - -s_not_b64 s[2:3], s[4:5] -// CHECK: s_not_b64 s[2:3], s[4:5] ; encoding: [0x04,0x08,0x82,0xbe] - -s_wqm_b32 s1, s2 -// CHECK: s_wqm_b32 s1, s2 ; encoding: [0x02,0x09,0x81,0xbe] - -s_wqm_b64 s[2:3], s[4:5] -// CHECK: s_wqm_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0a,0x82,0xbe] - -s_brev_b32 s1, s2 -// CHECK: s_brev_b32 s1, s2 ; encoding: [0x02,0x0b,0x81,0xbe] - -s_brev_b64 s[2:3], s[4:5] -// CHECK: s_brev_b64 s[2:3], s[4:5] ; encoding: [0x04,0x0c,0x82,0xbe] - -s_bcnt0_i32_b32 s1, s2 -// CHECK: s_bcnt0_i32_b32 s1, s2 ; encoding: [0x02,0x0d,0x81,0xbe] - -s_bcnt0_i32_b64 s1, s[2:3] -// CHECK: s_bcnt0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x0e,0x81,0xbe] - -s_bcnt1_i32_b32 s1, s2 -// CHECK: s_bcnt1_i32_b32 s1, s2 ; encoding: [0x02,0x0f,0x81,0xbe] - -s_bcnt1_i32_b64 s1, s[2:3] -// CHECK: s_bcnt1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x10,0x81,0xbe] - -s_ff0_i32_b32 s1, s2 -// CHECK: s_ff0_i32_b32 s1, s2 ; encoding: [0x02,0x11,0x81,0xbe] - -s_ff0_i32_b64 s1, s[2:3] -// CHECK: s_ff0_i32_b64 s1, s[2:3] ; encoding: [0x02,0x12,0x81,0xbe] - -s_ff1_i32_b32 s1, s2 -// CHECK: s_ff1_i32_b32 s1, s2 ; encoding: [0x02,0x13,0x81,0xbe] - -s_ff1_i32_b64 s1, s[2:3] -// CHECK: s_ff1_i32_b64 s1, s[2:3] ; encoding: [0x02,0x14,0x81,0xbe] - -s_flbit_i32_b32 s1, s2 -// CHECK: s_flbit_i32_b32 s1, s2 ; encoding: [0x02,0x15,0x81,0xbe] - -s_flbit_i32_b64 s1, s[2:3] -// CHECK: s_flbit_i32_b64 s1, s[2:3] ; encoding: [0x02,0x16,0x81,0xbe] - -s_flbit_i32 s1, s2 -// CHECK: s_flbit_i32 s1, s2 ; 
encoding: [0x02,0x17,0x81,0xbe] - -s_flbit_i32_i64 s1, s[2:3] -// CHECK: s_flbit_i32_i64 s1, s[2:3] ; encoding: [0x02,0x18,0x81,0xbe] - -s_sext_i32_i8 s1, s2 -// CHECK: s_sext_i32_i8 s1, s2 ; encoding: [0x02,0x19,0x81,0xbe] - -s_sext_i32_i16 s1, s2 -// CHECK: s_sext_i32_i16 s1, s2 ; encoding: [0x02,0x1a,0x81,0xbe] - -s_bitset0_b32 s1, s2 -// CHECK: s_bitset0_b32 s1, s2 ; encoding: [0x02,0x1b,0x81,0xbe] - -s_bitset0_b64 s[2:3], s[4:5] -// CHECK: s_bitset0_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1c,0x82,0xbe] - -s_bitset1_b32 s1, s2 -// CHECK: s_bitset1_b32 s1, s2 ; encoding: [0x02,0x1d,0x81,0xbe] - -s_bitset1_b64 s[2:3], s[4:5] -// CHECK: s_bitset1_b64 s[2:3], s[4:5] ; encoding: [0x04,0x1e,0x82,0xbe] - -s_getpc_b64 s[2:3] -// CHECK: s_getpc_b64 s[2:3] ; encoding: [0x00,0x1f,0x82,0xbe] - -s_setpc_b64 s[2:3], s[4:5] -// CHECK: s_setpc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x20,0x82,0xbe] - -s_swappc_b64 s[2:3], s[4:5] -// CHECK: s_swappc_b64 s[2:3], s[4:5] ; encoding: [0x04,0x21,0x82,0xbe] - -s_rfe_b64 s[2:3], s[4:5] -// CHECK: s_rfe_b64 s[2:3], s[4:5] ; encoding: [0x04,0x22,0x82,0xbe] - -s_and_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_and_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x24,0x82,0xbe] - -s_or_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_or_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x25,0x82,0xbe] - -s_xor_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_xor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x26,0x82,0xbe] - -s_andn2_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_andn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x27,0x82,0xbe] - -s_orn2_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_orn2_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x28,0x82,0xbe] - -s_nand_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_nand_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x29,0x82,0xbe] - -s_nor_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_nor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2a,0x82,0xbe] - -s_xnor_saveexec_b64 s[2:3], s[4:5] -// CHECK: s_xnor_saveexec_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2b,0x82,0xbe] - -s_quadmask_b32 s1, s2 -// CHECK: s_quadmask_b32 s1, s2 ; encoding: [0x02,0x2c,0x81,0xbe] - -s_quadmask_b64 s[2:3], s[4:5] -// CHECK: s_quadmask_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2d,0x82,0xbe] - -s_movrels_b32 s1, s2 -// CHECK: s_movrels_b32 s1, s2 ; encoding: [0x02,0x2e,0x81,0xbe] - -s_movrels_b64 s[2:3], s[4:5] -// CHECK: s_movrels_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2f,0x82,0xbe] - -s_movreld_b32 s1, s2 -// CHECK: s_movreld_b32 s1, s2 ; encoding: [0x02,0x30,0x81,0xbe] - -s_movreld_b64 s[2:3], s[4:5] -// CHECK: s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x31,0x82,0xbe] - -s_cbranch_join s[4:5] -// CHECK: s_cbranch_join s[4:5] ; encoding: [0x04,0x32,0x80,0xbe] - -s_abs_i32 s1, s2 -// CHECK: s_abs_i32 s1, s2 ; encoding: [0x02,0x34,0x81,0xbe] - -s_mov_fed_b32 s1, s2 -// CHECK: s_mov_fed_b32 s1, s2 ; encoding: [0x02,0x35,0x81,0xbe] diff --git a/test/MC/R600/sop2.s b/test/MC/R600/sop2.s deleted file mode 100644 index 9a7a1c01064..00000000000 --- a/test/MC/R600/sop2.s +++ /dev/null @@ -1,131 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -// CHECK: s_add_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x80] -s_add_u32 s1, s2, s3 - -// CHECK: s_sub_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x80] -s_sub_u32 s1, s2, s3 - -// CHECK: s_add_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x81] -s_add_i32 s1, s2, s3 - -// CHECK: s_sub_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x81] -s_sub_i32 s1, s2, s3 
- -// CHECK: s_addc_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x82] -s_addc_u32 s1, s2, s3 - -// CHECK: s_subb_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x82] -s_subb_u32 s1, s2, s3 - -// CHECK: s_min_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x83] -s_min_i32 s1, s2, s3 - -// CHECK: s_min_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x83] -s_min_u32 s1, s2, s3 - -// CHECK: s_max_i32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x84] -s_max_i32 s1, s2, s3 - -// CHECK: s_max_u32 s1, s2, s3 ; encoding: [0x02,0x03,0x81,0x84] -s_max_u32 s1, s2, s3 - -// CHECK: s_cselect_b32 s1, s2, s3 ; encoding: [0x02,0x03,0x01,0x85] -s_cselect_b32 s1, s2, s3 - -// CHECK: s_cselect_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x85] -s_cselect_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_and_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x87] -s_and_b32 s2, s4, s6 - -// CHECK: s_and_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x87] -s_and_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_or_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x88] -s_or_b32 s2, s4, s6 - -// CHECK: s_or_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x88] -s_or_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_xor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x89] -s_xor_b32 s2, s4, s6 - -// CHECK: s_xor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x89] -s_xor_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_andn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8a] -s_andn2_b32 s2, s4, s6 - -// CHECK: s_andn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8a] -s_andn2_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_orn2_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8b] -s_orn2_b32 s2, s4, s6 - -// CHECK: s_orn2_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8b] -s_orn2_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_nand_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8c] -s_nand_b32 s2, s4, s6 - -// CHECK: s_nand_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8c] -s_nand_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_nor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8d] -s_nor_b32 s2, s4, s6 - -// CHECK: s_nor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8d] -s_nor_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_xnor_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8e] -s_xnor_b32 s2, s4, s6 - -// CHECK: s_xnor_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x8e] -s_xnor_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_lshl_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x8f] -s_lshl_b32 s2, s4, s6 - -// CHECK: s_lshl_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x8f] -s_lshl_b64 s[2:3], s[4:5], s6 - -// CHECK: s_lshr_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x90] -s_lshr_b32 s2, s4, s6 - -// CHECK: s_lshr_b64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x90] -s_lshr_b64 s[2:3], s[4:5], s6 - -// CHECK: s_ashr_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x91] -s_ashr_i32 s2, s4, s6 - -// CHECK: s_ashr_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x82,0x91] -s_ashr_i64 s[2:3], s[4:5], s6 - -// CHECK: s_bfm_b32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x92] -s_bfm_b32 s2, s4, s6 - -// CHECK: s_bfm_b64 s[2:3], s[4:5], s[6:7] ; encoding: [0x04,0x06,0x82,0x92] -s_bfm_b64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_mul_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x93] -s_mul_i32 s2, s4, s6 - -// CHECK: s_bfe_u32 s2, s4, s6 ; encoding: [0x04,0x06,0x82,0x93] -s_bfe_u32 s2, s4, s6 - -// CHECK: s_bfe_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x94] -s_bfe_i32 s2, s4, s6 - -// CHECK: s_bfe_u64 s[2:3], s[4:5], s[6:7] ; encoding: 
[0x04,0x06,0x82,0x94] -s_bfe_u64 s[2:3], s[4:5], s[6:7] - -// CHECK: s_bfe_i64 s[2:3], s[4:5], s6 ; encoding: [0x04,0x06,0x02,0x95] -s_bfe_i64 s[2:3], s[4:5], s6 - -// CHECK: s_cbranch_g_fork s[4:5], s[6:7] ; encoding: [0x04,0x06,0x80,0x95] -s_cbranch_g_fork s[4:5], s[6:7] - -// CHECK: s_absdiff_i32 s2, s4, s6 ; encoding: [0x04,0x06,0x02,0x96] -s_absdiff_i32 s2, s4, s6 diff --git a/test/MC/R600/sopc.s b/test/MC/R600/sopc.s deleted file mode 100644 index 0899c1a2eed..00000000000 --- a/test/MC/R600/sopc.s +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -s_cmp_eq_i32 s1, s2 -// CHECK: s_cmp_eq_i32 s1, s2 ; encoding: [0x01,0x02,0x00,0xbf] diff --git a/test/MC/R600/sopk.s b/test/MC/R600/sopk.s deleted file mode 100644 index 6c27aaccb80..00000000000 --- a/test/MC/R600/sopk.s +++ /dev/null @@ -1,66 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -s_movk_i32 s2, 0x6 -// CHECK: s_movk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb0] - -s_cmovk_i32 s2, 0x6 -// CHECK: s_cmovk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb1] - -s_cmpk_eq_i32 s2, 0x6 -// CHECK: s_cmpk_eq_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb1] - -s_cmpk_lg_i32 s2, 0x6 -// CHECK: s_cmpk_lg_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb2] - -s_cmpk_gt_i32 s2, 0x6 -// CHECK: s_cmpk_gt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb2] - -s_cmpk_ge_i32 s2, 0x6 -// CHECK: s_cmpk_ge_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb3] - -s_cmpk_lt_i32 s2, 0x6 -// CHECK: s_cmpk_lt_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb3] - -s_cmpk_le_i32 s2, 0x6 -// CHECK: s_cmpk_le_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb4] - -s_cmpk_eq_u32 s2, 0x6 -// CHECK: s_cmpk_eq_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb4] - -s_cmpk_lg_u32 s2, 0x6 -// CHECK: s_cmpk_lg_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb5] - -s_cmpk_gt_u32 s2, 0x6 -// CHECK: s_cmpk_gt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb5] - -s_cmpk_ge_u32 s2, 0x6 -// CHECK: s_cmpk_ge_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb6] - -s_cmpk_lt_u32 s2, 0x6 -// CHECK: s_cmpk_lt_u32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb6] - -s_cmpk_le_u32 s2, 0x6 -// CHECK: s_cmpk_le_u32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb7] - -s_addk_i32 s2, 0x6 -// CHECK: s_addk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb7] - -s_mulk_i32 s2, 0x6 -// CHECK: s_mulk_i32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb8] - -s_cbranch_i_fork s[2:3], 0x6 -// CHECK: s_cbranch_i_fork s[2:3], 0x6 ; encoding: [0x06,0x00,0x82,0xb8] - -s_getreg_b32 s2, 0x6 -// CHECK: s_getreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x02,0xb9] - -s_setreg_b32 s2, 0x6 -// CHECK: s_setreg_b32 s2, 0x6 ; encoding: [0x06,0x00,0x82,0xb9] - -s_setreg_imm32_b32 0xff, 0x6 -// CHECK: s_setreg_imm32_b32 0xff, 0x6 ; encoding: [0x06,0x00,0x80,0xba,0xff,0x00,0x00,0x00] diff --git a/test/MC/R600/sopp.s b/test/MC/R600/sopp.s deleted file mode 100644 index b072c16fdb2..00000000000 --- a/test/MC/R600/sopp.s +++ /dev/null @@ -1,64 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI 
-show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Edge Cases -//===----------------------------------------------------------------------===// - -s_nop 0 // CHECK: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] -s_nop 0xffff // CHECK: s_nop 0xffff ; encoding: [0xff,0xff,0x80,0xbf] - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - - s_nop 1 // CHECK: s_nop 1 ; encoding: [0x01,0x00,0x80,0xbf] - s_endpgm // CHECK: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] - s_branch 2 // CHECK: s_branch 2 ; encoding: [0x02,0x00,0x82,0xbf] - s_cbranch_scc0 3 // CHECK: s_cbranch_scc0 3 ; encoding: [0x03,0x00,0x84,0xbf] - s_cbranch_scc1 4 // CHECK: s_cbranch_scc1 4 ; encoding: [0x04,0x00,0x85,0xbf] - s_cbranch_vccz 5 // CHECK: s_cbranch_vccz 5 ; encoding: [0x05,0x00,0x86,0xbf] - s_cbranch_vccnz 6 // CHECK: s_cbranch_vccnz 6 ; encoding: [0x06,0x00,0x87,0xbf] - s_cbranch_execz 7 // CHECK: s_cbranch_execz 7 ; encoding: [0x07,0x00,0x88,0xbf] - s_cbranch_execnz 8 // CHECK: s_cbranch_execnz 8 ; encoding: [0x08,0x00,0x89,0xbf] - s_barrier // CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf] - -//===----------------------------------------------------------------------===// -// s_waitcnt -//===----------------------------------------------------------------------===// - - s_waitcnt 0 - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] - - s_waitcnt vmcnt(1) - // CHECK: s_waitcnt vmcnt(1) ; encoding: [0x71,0x07,0x8c,0xbf] - - s_waitcnt expcnt(2) - // CHECK: s_waitcnt expcnt(2) ; encoding: [0x2f,0x07,0x8c,0xbf] - - s_waitcnt lgkmcnt(3) - // CHECK: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0x03,0x8c,0xbf] - - s_waitcnt vmcnt(0), expcnt(0) - // CHECK: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x07,0x8c,0xbf] - - - s_sethalt 9 // CHECK: s_sethalt 9 ; encoding: [0x09,0x00,0x8d,0xbf] - s_sleep 10 // CHECK: s_sleep 10 ; encoding: [0x0a,0x00,0x8e,0xbf] - s_setprio 1 // CHECK: s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf] - s_sendmsg 2 // CHECK: s_sendmsg Gs(nop), [m0] ; encoding: [0x02,0x00,0x90,0xbf] - s_sendmsghalt 3 // CHECK: s_sendmsghalt 3 ; encoding: [0x03,0x00,0x91,0xbf] - s_trap 4 // CHECK: s_trap 4 ; encoding: [0x04,0x00,0x92,0xbf] - s_icache_inv // CHECK: s_icache_inv ; encoding: [0x00,0x00,0x93,0xbf] - s_incperflevel 5 // CHECK: s_incperflevel 5 ; encoding: [0x05,0x00,0x94,0xbf] - s_decperflevel 6 // CHECK: s_decperflevel 6 ; encoding: [0x06,0x00,0x95,0xbf] - s_ttracedata // CHECK: s_ttracedata ; encoding: [0x00,0x00,0x96,0xbf] diff --git a/test/MC/R600/vop1.s b/test/MC/R600/vop1.s deleted file mode 100644 index d0b00fcd189..00000000000 --- a/test/MC/R600/vop1.s +++ /dev/null @@ -1,357 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SI --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire 
-show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI --check-prefix=CIVI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI - -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI - - -// GCN: v_nop ; encoding: [0x00,0x00,0x00,0x7e] -v_nop - -// GCN: v_mov_b32_e32 v1, v2 ; encoding: [0x02,0x03,0x02,0x7e] -v_mov_b32 v1, v2 - -// GCN: v_readfirstlane_b32 s1, v2 ; encoding: [0x02,0x05,0x02,0x7e] -v_readfirstlane_b32 s1, v2 - -// GCN: v_cvt_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x07,0x02,0x7e] -v_cvt_i32_f64 v1, v[2:3] - -// GCN: v_cvt_f64_i32_e32 v[1:2], v2 ; encoding: [0x02,0x09,0x02,0x7e] -v_cvt_f64_i32 v[1:2], v2 - -// GCN: v_cvt_f32_i32_e32 v1, v2 ; encoding: [0x02,0x0b,0x02,0x7e] -v_cvt_f32_i32 v1, v2 - -// GCN: v_cvt_f32_u32_e32 v1, v2 ; encoding: [0x02,0x0d,0x02,0x7e] -v_cvt_f32_u32 v1, v2 - -// GCN: v_cvt_u32_f32_e32 v1, v2 ; encoding: [0x02,0x0f,0x02,0x7e -v_cvt_u32_f32 v1, v2 - -// GCN: v_cvt_i32_f32_e32 v1, v2 ; encoding: [0x02,0x11,0x02,0x7e] -v_cvt_i32_f32 v1, v2 - -// SICI: v_mov_fed_b32_e32 v1, v2 ; encoding: [0x02,0x13,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -v_mov_fed_b32 v1, v2 - -// GCN: v_cvt_f16_f32_e32 v1, v2 ; encoding: [0x02,0x15,0x02,0x7e] -v_cvt_f16_f32 v1, v2 - -// GCN: v_cvt_f32_f16_e32 v1, v2 ; encoding: [0x02,0x17,0x02,0x7e] -v_cvt_f32_f16 v1, v2 - -// GCN: v_cvt_rpi_i32_f32_e32 v1, v2 ; encoding: [0x02,0x19,0x02,0x7e] -v_cvt_rpi_i32_f32 v1, v2 - -// GCN: v_cvt_flr_i32_f32_e32 v1, v2 ; encoding: [0x02,0x1b,0x02,0x7e] -v_cvt_flr_i32_f32 v1, v2 - -// GCN: v_cvt_off_f32_i4_e32 v1, v2 ; encoding: [0x02,0x1d,0x02,0x7e] -v_cvt_off_f32_i4_e32 v1, v2 - -// GCN: v_cvt_f32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x1f,0x02,0x7e] -v_cvt_f32_f64 v1, v[2:3] - -// GCN: v_cvt_f64_f32_e32 v[1:2], v2 ; encoding: [0x02,0x21,0x02,0x7e] -v_cvt_f64_f32 v[1:2], v2 - -// GCN: v_cvt_f32_ubyte0_e32 v1, v2 ; encoding: [0x02,0x23,0x02,0x7e] -v_cvt_f32_ubyte0 v1, v2 - -// GCN: v_cvt_f32_ubyte1_e32 v1, v2 ; encoding: [0x02,0x25,0x02,0x7e] -v_cvt_f32_ubyte1_e32 v1, v2 - -// GCN: v_cvt_f32_ubyte2_e32 v1, v2 ; encoding: [0x02,0x27,0x02,0x7e] -v_cvt_f32_ubyte2 v1, v2 - -// GCN: v_cvt_f32_ubyte3_e32 v1, v2 ; encoding: [0x02,0x29,0x02,0x7e] -v_cvt_f32_ubyte3 v1, v2 - -// GCN: v_cvt_u32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x2b,0x02,0x7e] -v_cvt_u32_f64 v1, v[2:3] - -// GCN: v_cvt_f64_u32_e32 v[1:2], v2 ; encoding: [0x02,0x2d,0x02,0x7e] -v_cvt_f64_u32 v[1:2], v2 - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_trunc_f64_e32 v[1:2], v[2:3] -// CIVI: v_trunc_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x2f,0x02,0x7e] -v_trunc_f64_e32 v[1:2], v[2:3] - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_ceil_f64_e32 v[1:2], v[2:3] -// CIVI: v_ceil_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x31,0x02,0x7e] -v_ceil_f64_e32 v[1:2], v[2:3] - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_rndne_f64_e32 v[1:2], v[2:3] -// CIVI: v_rndne_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x33,0x02,0x7e] -v_rndne_f64_e32 v[1:2], v[2:3] - -// NOSI: error: 
instruction not supported on this GPU -// NOSI: v_floor_f64_e32 v[1:2], v[2:3] -// CIVI: v_floor_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x35,0x02,0x7e] -v_floor_f64_e32 v[1:2], v[2:3] - -// SICI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] -// VI: v_fract_f32_e32 v1, v2 ; encoding: [0x02,0x37,0x02,0x7e] -v_fract_f32 v1, v2 - -// SICI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] -// VI: v_trunc_f32_e32 v1, v2 ; encoding: [0x02,0x39,0x02,0x7e] -v_trunc_f32 v1, v2 - -// SICI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] -// VI: v_ceil_f32_e32 v1, v2 ; encoding: [0x02,0x3b,0x02,0x7e] -v_ceil_f32 v1, v2 - -// SICI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] -// VI: v_rndne_f32_e32 v1, v2 ; encoding: [0x02,0x3d,0x02,0x7e] -v_rndne_f32 v1, v2 - -// SICI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] -// VI: v_floor_f32_e32 v1, v2 ; encoding: [0x02,0x3f,0x02,0x7e] -v_floor_f32_e32 v1, v2 - -// SICI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x4b,0x02,0x7e] -// VI: v_exp_f32_e32 v1, v2 ; encoding: [0x02,0x41,0x02,0x7e] -v_exp_f32 v1, v2 - -// SICI: v_log_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x4d,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_log_clamp_f32 v1, v2 -v_log_clamp_f32 v1, v2 - -// SICI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] -// VI: v_log_f32_e32 v1, v2 ; encoding: [0x02,0x43,0x02,0x7e] -v_log_f32 v1, v2 - -// SICI: v_rcp_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x51,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rcp_clamp_f32 v1, v2 -v_rcp_clamp_f32 v1, v2 - -// SICI: v_rcp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rcp_legacy_f32 v1, v2 -v_rcp_legacy_f32 v1, v2 - -// SICI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] -// VI: v_rcp_f32_e32 v1, v2 ; encoding: [0x02,0x45,0x02,0x7e] -v_rcp_f32 v1, v2 - -// SICI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] -// VI: v_rcp_iflag_f32_e32 v1, v2 ; encoding: [0x02,0x47,0x02,0x7e] -v_rcp_iflag_f32 v1, v2 - -// SICI: v_rsq_clamp_f32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rsq_clamp_f32 v1, v2 -v_rsq_clamp_f32 v1, v2 - -// SICI: v_rsq_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rsq_legacy_f32 v1, v2 -v_rsq_legacy_f32 v1, v2 - -// SICI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] -// VI: v_rsq_f32_e32 v1, v2 ; encoding: [0x02,0x49,0x02,0x7e] -v_rsq_f32_e32 v1, v2 - -// SICI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x5f,0x02,0x7e] -// VI: v_rcp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4b,0x02,0x7e] -v_rcp_f64 v[1:2], v[2:3] - -// SICI: v_rcp_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rcp_clamp_f64 v[1:2], v[2:3] -v_rcp_clamp_f64 v[1:2], v[2:3] - -// SICI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] -// VI: v_rsq_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x4d,0x02,0x7e] -v_rsq_f64 v[1:2], v[2:3] - -// SICI: v_rsq_clamp_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_rsq_clamp_f64 v[1:2], v[2:3] -v_rsq_clamp_f64 v[1:2], v[2:3] - -// SICI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] -// VI: v_sqrt_f32_e32 v1, v2 ; encoding: [0x02,0x4f,0x02,0x7e] 
-v_sqrt_f32 v1, v2 - -// SICI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x69,0x02,0x7e] -// VI: v_sqrt_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x51,0x02,0x7e] -v_sqrt_f64 v[1:2], v[2:3] - -// SICI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x6b,0x02,0x7e] -// VI: v_sin_f32_e32 v1, v2 ; encoding: [0x02,0x53,0x02,0x7e] -v_sin_f32 v1, v2 - -// SICI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] -// VI: v_cos_f32_e32 v1, v2 ; encoding: [0x02,0x55,0x02,0x7e] -v_cos_f32 v1, v2 - -// SICI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] -// VI: v_not_b32_e32 v1, v2 ; encoding: [0x02,0x57,0x02,0x7e] -v_not_b32 v1, v2 - -// SICI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] -// VI: v_bfrev_b32_e32 v1, v2 ; encoding: [0x02,0x59,0x02,0x7e] -v_bfrev_b32 v1, v2 - -// SICI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] -// VI: v_ffbh_u32_e32 v1, v2 ; encoding: [0x02,0x5b,0x02,0x7e] -v_ffbh_u32 v1, v2 - -// SICI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] -// VI: v_ffbl_b32_e32 v1, v2 ; encoding: [0x02,0x5d,0x02,0x7e] -v_ffbl_b32 v1, v2 - -// SICI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] -// VI: v_ffbh_i32_e32 v1, v2 ; encoding: [0x02,0x5f,0x02,0x7e] -v_ffbh_i32_e32 v1, v2 - -// SICI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x79,0x02,0x7e] -// VI: v_frexp_exp_i32_f64_e32 v1, v[2:3] ; encoding: [0x02,0x61,0x02,0x7e] -v_frexp_exp_i32_f64 v1, v[2:3] - -// SICI: v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7b,0x02,0x7e] -// VI; v_frexp_mant_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x63,0x02,0x7e] -v_frexp_mant_f64 v[1:2], v[2:3] - -// SICI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x7d,0x02,0x7e] -// VI: v_fract_f64_e32 v[1:2], v[2:3] ; encoding: [0x02,0x65,0x02,0x7e] -v_fract_f64 v[1:2], v[2:3] - -// SICI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] -// VI: v_frexp_exp_i32_f32_e32 v1, v2 ; encoding: [0x02,0x67,0x02,0x7e] -v_frexp_exp_i32_f32 v1, v2 - -// SICI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] -// VI: v_frexp_mant_f32_e32 v1, v2 ; encoding: [0x02,0x69,0x02,0x7e] -v_frexp_mant_f32 v1, v2 - -// SICI: v_clrexcp ; encoding: [0x00,0x82,0x00,0x7e] -// VI: v_clrexcp ; encoding: [0x00,0x6a,0x00,0x7e] -v_clrexcp - -// SICI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] -// VI: v_movreld_b32_e32 v1, v2 ; encoding: [0x02,0x6d,0x02,0x7e] -v_movreld_b32 v1, v2 - -// SICI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] -// VI: v_movrels_b32_e32 v1, v2 ; encoding: [0x02,0x6f,0x02,0x7e] -v_movrels_b32 v1, v2 - -// SICI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] -// VI: v_movrelsd_b32_e32 v1, v2 ; encoding: [0x02,0x71,0x02,0x7e] -v_movrelsd_b32 v1, v2 - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_log_legacy_f32 v1, v2 -// CI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] -// VI: v_log_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x99,0x02,0x7e] -v_log_legacy_f32 v1, v2 - -// NOSI: error: instruction not supported on this GPU -// NOSI: v_exp_legacy_f32 v1, v2 -// CI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] -// VI: v_exp_legacy_f32_e32 v1, v2 ; encoding: [0x02,0x97,0x02,0x7e] -v_exp_legacy_f32 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_f16_u16 v1, v2 -// VI: v_cvt_f16_u16_e32 v1, v2 ; encoding: [0x02,0x73,0x02,0x7e] -v_cvt_f16_u16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_f16_i16 v1, v2 -// VI: 
v_cvt_f16_i16_e32 v1, v2 ; encoding: [0x02,0x75,0x02,0x7e] -v_cvt_f16_i16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_u16_f16 v1, v2 -// VI: v_cvt_u16_f16_e32 v1, v2 ; encoding: [0x02,0x77,0x02,0x7e] -v_cvt_u16_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cvt_i16_f16 v1, v2 -// VI: v_cvt_i16_f16_e32 v1, v2 ; encoding: [0x02,0x79,0x02,0x7e] -v_cvt_i16_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_rcp_f16 v1, v2 -// VI: v_rcp_f16_e32 v1, v2 ; encoding: [0x02,0x7b,0x02,0x7e] -v_rcp_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sqrt_f16 v1, v2 -// VI: v_sqrt_f16_e32 v1, v2 ; encoding: [0x02,0x7d,0x02,0x7e] -v_sqrt_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_rsq_f16 v1, v2 -// VI: v_rsq_f16_e32 v1, v2 ; encoding: [0x02,0x7f,0x02,0x7e] -v_rsq_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_log_f16 v1, v2 -// VI: v_log_f16_e32 v1, v2 ; encoding: [0x02,0x81,0x02,0x7e] -v_log_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_exp_f16 v1, v2 -// VI: v_exp_f16_e32 v1, v2 ; encoding: [0x02,0x83,0x02,0x7e] -v_exp_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_frexp_mant_f16 v1, v2 -// VI: v_frexp_mant_f16_e32 v1, v2 ; encoding: [0x02,0x85,0x02,0x7e] -v_frexp_mant_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_frexp_exp_i16_f16 v1, v2 -// VI: v_frexp_exp_i16_f16_e32 v1, v2 ; encoding: [0x02,0x87,0x02,0x7e] -v_frexp_exp_i16_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_floor_f16 v1, v2 -// VI: v_floor_f16_e32 v1, v2 ; encoding: [0x02,0x89,0x02,0x7e] -v_floor_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_ceil_f16 v1, v2 -// VI: v_ceil_f16_e32 v1, v2 ; encoding: [0x02,0x8b,0x02,0x7e] -v_ceil_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_trunc_f16 v1, v2 -// VI: v_trunc_f16_e32 v1, v2 ; encoding: [0x02,0x8d,0x02,0x7e] -v_trunc_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_rndne_f16 v1, v2 -// VI: v_rndne_f16_e32 v1, v2 ; encoding: [0x02,0x8f,0x02,0x7e] -v_rndne_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_fract_f16 v1, v2 -// VI: v_fract_f16_e32 v1, v2 ; encoding: [0x02,0x91,0x02,0x7e] -v_fract_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sin_f16 v1, v2 -// VI: v_sin_f16_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] -v_sin_f16 v1, v2 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_cos_f16 v1, v2 -// VI: v_cos_f16_e32 v1, v2 ; encoding: [0x02,0x95,0x02,0x7e] -v_cos_f16 v1, v2 diff --git a/test/MC/R600/vop2-err.s b/test/MC/R600/vop2-err.s deleted file mode 100644 index a1131000a90..00000000000 --- a/test/MC/R600/vop2-err.s +++ /dev/null @@ -1,35 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn %s 2>&1 | FileCheck %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI %s 2>&1 | FileCheck %s - -//===----------------------------------------------------------------------===// -// Generic checks -//===----------------------------------------------------------------------===// - -v_mul_i32_i24 v1, v2, 100 -// CHECK: error: invalid operand for instruction - -//===----------------------------------------------------------------------===// -// _e32 
checks -//===----------------------------------------------------------------------===// - -// Immediate src1 -v_mul_i32_i24_e32 v1, v2, 100 -// CHECK: error: invalid operand for instruction - -// sgpr src1 -v_mul_i32_i24_e32 v1, v2, s3 -// CHECK: error: invalid operand for instruction - -//===----------------------------------------------------------------------===// -// _e64 checks -//===----------------------------------------------------------------------===// - -// Immediate src0 -v_mul_i32_i24_e64 v1, 100, v3 -// CHECK: error: invalid operand for instruction - -// Immediate src1 -v_mul_i32_i24_e64 v1, v2, 100 -// CHECK: error: invalid operand for instruction - -// TODO: Constant bus restrictions diff --git a/test/MC/R600/vop2.s b/test/MC/R600/vop2.s deleted file mode 100644 index a1f3b8d8936..00000000000 --- a/test/MC/R600/vop2.s +++ /dev/null @@ -1,421 +0,0 @@ -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI - -// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s -check-prefix=NOVI - -//===----------------------------------------------------------------------===// -// Generic Checks for floating-point instructions (These have modifiers). -//===----------------------------------------------------------------------===// - -// TODO: 64-bit encoding of instructions with modifiers - -// _e32 suffix -// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] -v_add_f32_e32 v1, v2, v3 - -// src0 inline immediate -// SICI: v_add_f32_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x06] -v_add_f32 v1, 1.0, v3 - -// src0 negative inline immediate -// SICI: v_add_f32_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x06] -v_add_f32 v1, -1.0, v3 - -// src0 literal -// SICI: v_add_f32_e32 v1, 0x42c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0x42] -v_add_f32 v1, 100.0, v3 - -// src0 negative literal -// SICI: v_add_f32_e32 v1, 0xc2c80000, v3 ; encoding: [0xff,0x06,0x02,0x06,0x00,0x00,0xc8,0xc2] -v_add_f32 v1, -100.0, v3 - -//===----------------------------------------------------------------------===// -// Generic Checks for integer instructions (These don't have modifiers). 
-//===----------------------------------------------------------------------===// - -// _e32 suffix -// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] -v_mul_i32_i24_e32 v1, v2, v3 - -// _e64 suffix -// SICI: v_mul_i32_i24_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x02,0x00] -v_mul_i32_i24_e64 v1, v2, v3 - -// src0 inline -// SICI: v_mul_i32_i24_e32 v1, 3, v3 ; encoding: [0x83,0x06,0x02,0x12] -v_mul_i32_i24 v1, 3, v3 - -// src0 negative inline -// SICI: v_mul_i32_i24_e32 v1, -3, v3 ; encoding: [0xc3,0x06,0x02,0x12] -v_mul_i32_i24 v1, -3, v3 - -// src1 inline -// SICI: v_mul_i32_i24_e64 v1, v2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x01,0x00] -v_mul_i32_i24 v1, v2, 3 - -// src1 negative inline -// SICI: v_mul_i32_i24_e64 v1, v2, -3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x87,0x01,0x00] -v_mul_i32_i24 v1, v2, -3 - -// src0 literal -// SICI: v_mul_i32_i24_e32 v1, 0x64, v3 ; encoding: [0xff,0x06,0x02,0x12,0x64,0x00,0x00,0x00] -v_mul_i32_i24 v1, 100, v3 - -// src1 negative literal -// SICI: v_mul_i32_i24_e32 v1, 0xffffff9c, v3 ; encoding: [0xff,0x06,0x02,0x12,0x9c,0xff,0xff,0xff] -v_mul_i32_i24 v1, -100, v3 - -//===----------------------------------------------------------------------===// -// Checks for legal operands -//===----------------------------------------------------------------------===// - -// src0 sgpr -// SICI: v_mul_i32_i24_e32 v1, s2, v3 ; encoding: [0x02,0x06,0x02,0x12] -v_mul_i32_i24 v1, s2, v3 - -// src1 sgpr -// SICI: v_mul_i32_i24_e64 v1, v2, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x07,0x00,0x00] -v_mul_i32_i24 v1, v2, s3 - -// src0, src1 same sgpr -// SICI: v_mul_i32_i24_e64 v1, s2, s2 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x04,0x00,0x00] -v_mul_i32_i24 v1, s2, s2 - -// src0 sgpr, src1 inline -// SICI: v_mul_i32_i24_e64 v1, s2, 3 ; encoding: [0x01,0x00,0x12,0xd2,0x02,0x06,0x01,0x00] -v_mul_i32_i24 v1, s2, 3 - -// src0 inline src1 sgpr -// SICI: v_mul_i32_i24_e64 v1, 3, s3 ; encoding: [0x01,0x00,0x12,0xd2,0x83,0x06,0x00,0x00] -v_mul_i32_i24 v1, 3, s3 - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -// GCN: v_cndmask_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x00] -v_cndmask_b32 v1, v2, v3 - -// SICI: v_readlane_b32 s1, v2, s3 ; encoding: [0x02,0x07,0x02,0x02] -// VI: v_readlane_b32 s1, v2, s3 ; encoding: [0x01,0x00,0x89,0xd2,0x02,0x07,0x00,0x00] -v_readlane_b32 s1, v2, s3 - -// SICI: v_writelane_b32 v1, s2, s3 ; encoding: [0x02,0x06,0x02,0x04] -// VI: v_writelane_b32 v1, s2, s3 ; encoding: [0x01,0x00,0x8a,0xd2,0x02,0x06,0x00,0x00] -v_writelane_b32 v1, s2, s3 - -// SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] -// VI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x02] -v_add_f32 v1, v2, v3 - -// SICI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] -// VI: v_sub_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x04] -v_sub_f32 v1, v2, v3 - -// SICI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] -// VI: v_subrev_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06] -v_subrev_f32 v1, v2, v3 - -// SICI: v_mac_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_mac_legacy_f32 v1, v2, v3 -v_mac_legacy_f32 v1, v2, v3 - -// SICI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] -// VI: v_mul_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x08] 
-v_mul_legacy_f32_e32 v1, v2, v3 - -// SICI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] -// VI: v_mul_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0a] -v_mul_f32 v1, v2, v3 - -// SICI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] -// VI: v_mul_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0c] -v_mul_i32_i24 v1, v2, v3 - -// SICI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] -// VI: v_mul_hi_i32_i24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x0e] -v_mul_hi_i32_i24 v1, v2, v3 - -// SICI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] -// VI: v_mul_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x10] -v_mul_u32_u24 v1, v2, v3 - -// SICI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] -// VI: v_mul_hi_u32_u24_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x12] -v_mul_hi_u32_u24 v1, v2, v3 - -// SICI: v_min_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_min_legacy_f32_e32 v1, v2, v3 -v_min_legacy_f32_e32 v1, v2, v3 - -// SICI: v_max_legacy_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_max_legacy_f32 v1, v2, v3 -v_max_legacy_f32 v1, v2, v3 - -// SICI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] -// VI: v_min_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x14] -v_min_f32_e32 v1, v2, v3 - -// SICI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] -// VI: v_max_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x16] -v_max_f32 v1, v2 v3 - -// SICI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] -// VI: v_min_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x18] -v_min_i32 v1, v2, v3 - -// SICI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] -// VI: v_max_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1a] -v_max_i32 v1, v2, v3 - -// SICI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] -// VI: v_min_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1c] -v_min_u32 v1, v2, v3 - -// SICI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] -// VI: v_max_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x1e] -v_max_u32 v1, v2, v3 - -// SICI: v_lshr_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_lshr_b32 v1, v2, v3 -v_lshr_b32 v1, v2, v3 - -// SICI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] -// VI: v_lshrrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x20] -v_lshrrev_b32 v1, v2, v3 - -// SICI: v_ashr_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2e] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_ashr_i32 v1, v2, v3 -v_ashr_i32 v1, v2, v3 - -// SICI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x30] -// VI: v_ashrrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x22] -v_ashrrev_i32 v1, v2, v3 - -// SICI: v_lshl_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] -// NOVI: error: instruction not supported on this GPU -// NOVI: v_lshl_b32_e32 v1, v2, v3 -v_lshl_b32_e32 v1, v2, v3 - -// SICI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] -// VI: v_lshlrev_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x24] -v_lshlrev_b32 v1, v2, v3 - -// SICI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] -// VI: v_and_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x26] -v_and_b32 v1, v2, v3 - -// SICI: v_or_b32_e32 v1, v2, v3 ; encoding: 
[0x02,0x07,0x02,0x38] -// VI: v_or_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x28] -v_or_b32 v1, v2, v3 - -// SICI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] -// VI: v_xor_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2a] -v_xor_b32 v1, v2, v3 - -// SICI: v_bfm_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] -// VI: v_bfm_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x93,0xd2,0x02,0x07,0x02,0x00] -v_bfm_b32 v1, v2, v3 - -// SICI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] -// VI: v_mac_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x2c] -v_mac_f32 v1, v2, v3 - -// SICI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x40,0x00,0x00,0x80,0x42] -// VI: v_madmk_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x2e,0x00,0x00,0x80,0x42] -v_madmk_f32 v1, v2, v3, 64.0 - -// SICI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x42,0x00,0x00,0x80,0x42] -// VI: v_madak_f32_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x30,0x00,0x00,0x80,0x42] -v_madak_f32 v1, v2, v3, 64.0 - -// SICI: v_bcnt_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] -// VI: v_bcnt_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8b,0xd2,0x02,0x07,0x02,0x00] -v_bcnt_u32_b32 v1, v2, v3 - -// SICI: v_mbcnt_lo_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] -// VI: v_mbcnt_lo_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8c,0xd2,0x02,0x07,0x02,0x00] -v_mbcnt_lo_u32_b32 v1, v2, v3 - -// SICI: v_mbcnt_hi_u32_b32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x48] -// VI: v_mbcnt_hi_u32_b32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x8d,0xd2,0x02,0x07,0x02,0x00] -v_mbcnt_hi_u32_b32 v1, v2, v3 - -// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] -// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] -v_add_i32 v1, v2, v3 - -// SICI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4a] -// VI: v_add_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x32] -v_add_u32 v1, v2, v3 - -// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] -// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] -v_sub_i32 v1, v2, v3 - -// SICI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] -// VI: v_sub_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x34] -v_sub_u32 v1, v2, v3 - -// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] -// VI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] -v_subrev_i32 v1, v2, v3 - -// SICI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] -// VI: v_subrev_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x36] -v_subrev_u32 v1, v2, v3 - -// SICI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] -// VI: v_addc_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x38] -v_addc_u32 v1, v2, v3 - -// SICI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] -// VI: v_subb_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3a] -v_subb_u32 v1, v2, v3 - -// SICI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] -// VI: v_subbrev_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3c] -v_subbrev_u32 v1, v2, v3 - -// SICI: v_ldexp_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] -// VI: v_ldexp_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x88,0xd2,0x02,0x07,0x02,0x00] -v_ldexp_f32 v1, v2, v3 - -// SICI: v_cvt_pkaccum_u8_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58] -// VI: v_cvt_pkaccum_u8_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0xf0,0xd1,0x02,0x07,0x02,0x00] -v_cvt_pkaccum_u8_f32 
v1, v2, v3 - -// SICI: v_cvt_pknorm_i16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a] -// VI: v_cvt_pknorm_i16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x94,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pknorm_i16_f32 v1, v2, v3 - -// SICI: v_cvt_pknorm_u16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c] -// VI: v_cvt_pknorm_u16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x95,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pknorm_u16_f32 v1, v2, v3 - -// SICI: v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e] -// VI: v_cvt_pkrtz_f16_f32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x96,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pkrtz_f16_f32 v1, v2, v3 - -// SICI: v_cvt_pk_u16_u32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60] -// VI: v_cvt_pk_u16_u32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x97,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pk_u16_u32 v1, v2, v3 - -// SICI: v_cvt_pk_i16_i32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62] -// VI: v_cvt_pk_i16_i32_e64 v1, v2, v3 ; encoding: [0x01,0x00,0x98,0xd2,0x02,0x07,0x02,0x00] -v_cvt_pk_i16_i32 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_add_f16 v1, v2, v3 -// VI: v_add_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x3e] -v_add_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sub_f16 v1, v2, v3 -// VI: v_sub_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x40] -v_sub_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_subrev_f16 v1, v2, v3 -// VI: v_subrev_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x42] -v_subrev_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_mul_f16 v1, v2, v3 -// VI: v_mul_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x44] -v_mul_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_mac_f16 v1, v2, v3 -// VI: v_mac_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x46] -v_mac_f16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_madmk_f16 v1, v2, v3, 64.0 -// VI: v_madmk_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42] -v_madmk_f16 v1, v2, v3, 64.0 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_madak_f16 v1, v2, v3, 64.0 -// VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42] -v_madak_f16 v1, v2, v3, 64.0 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_add_u16 v1, v2, v3 -// VI: v_add_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4c] -v_add_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_sub_u16 v1, v2, v3 -// VI: v_sub_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x4e] -v_sub_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_subrev_u16 v1, v2, v3 -// VI: v_subrev_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x50] -v_subrev_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_mul_lo_u16 v1, v2, v3 -// VI: v_mul_lo_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x52] -v_mul_lo_u16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_lshlrev_b16 v1, v2, v3 -// VI: v_lshlrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x54] -v_lshlrev_b16 v1, v2, v3 - -// NOSICI: error: instruction not supported on this GPU -// NOSICI: v_lshrrev_b16 v1, v2, v3 -// VI: v_lshrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x56] -v_lshrrev_b16 v1, v2, v3 - 
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_ashrrev_b16 v1, v2, v3
-// VI: v_ashrrev_b16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x58]
-v_ashrrev_b16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_max_f16 v1, v2, v3
-// VI: v_max_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5a]
-v_max_f16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_min_f16 v1, v2, v3
-// VI: v_min_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5c]
-v_min_f16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_max_u16 v1, v2, v3
-// VI: v_max_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x5e]
-v_max_u16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_max_i16 v1, v2, v3
-// VI: v_max_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x60]
-v_max_i16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_min_u16 v1, v2, v3
-// VI: v_min_u16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x62]
-v_min_u16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_min_i16 v1, v2, v3
-// VI: v_min_i16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x64]
-v_min_i16 v1, v2, v3
-
-// NOSICI: error: instruction not supported on this GPU
-// NOSICI: v_ldexp_f16 v1, v2, v3
-// VI: v_ldexp_f16_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x66]
-v_ldexp_f16 v1, v2, v3
diff --git a/test/MC/R600/vop3-errs.s b/test/MC/R600/vop3-errs.s
deleted file mode 100644
index b57fe6d5314..00000000000
--- a/test/MC/R600/vop3-errs.s
+++ /dev/null
@@ -1,5 +0,0 @@
-// RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s
-// RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s
-
-v_add_f32_e64 v0, v1
-// CHECK: error: too few operands for instruction
diff --git a/test/MC/R600/vop3.s b/test/MC/R600/vop3.s
deleted file mode 100644
index 20562335974..00000000000
--- a/test/MC/R600/vop3.s
+++ /dev/null
@@ -1,149 +0,0 @@
-// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s
-// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s
-
-//===----------------------------------------------------------------------===//
-// VOPC Instructions
-//===----------------------------------------------------------------------===//
-
-// Test forced e64 encoding
-
-v_cmp_lt_f32_e64 s[2:3], v4, -v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40]
-
-//
-// Modifier tests:
-//
-
-v_cmp_lt_f32 s[2:3] -v4, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x20]
-
-v_cmp_lt_f32 s[2:3] v4, -v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x40]
-
-v_cmp_lt_f32 s[2:3] -v4, -v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -v4, -v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x60]
-
-v_cmp_lt_f32 s[2:3] |v4|, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3] v4, |v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, |v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3] |v4|, |v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], |v4|, |v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3] -|v4|, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, v6 ; encoding: [0x02,0x01,0x02,0xd0,0x04,0x0d,0x02,0x20]
-
-v_cmp_lt_f32 s[2:3] v4, -|v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, -|v6| ; encoding: [0x02,0x02,0x02,0xd0,0x04,0x0d,0x02,0x40]
-
-v_cmp_lt_f32 s[2:3] -|v4|, -|v6|
-// CHECK: v_cmp_lt_f32_e64 s[2:3], -|v4|, -|v6| ; encoding: [0x02,0x03,0x02,0xd0,0x04,0x0d,0x02,0x60]
-
-//
-// Instruction tests:
-//
-
-v_cmp_f_f32 s[2:3], v4, v6
-// CHECK: v_cmp_f_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x00,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lt_f32 s[2:3], v4, v6
-// CHECK: v_cmp_lt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x02,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_eq_f32 s[2:3], v4, v6
-// CHECK: v_cmp_eq_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x04,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_le_f32 s[2:3], v4, v6
-// CHECK: v_cmp_le_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x06,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_gt_f32 s[2:3], v4, v6
-// CHECK: v_cmp_gt_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x08,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_lg_f32 s[2:3], v4, v6
-// CHECK: v_cmp_lg_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0a,0xd0,0x04,0x0d,0x02,0x00]
-
-v_cmp_ge_f32 s[2:3], v4, v6
-// CHECK: v_cmp_ge_f32_e64 s[2:3], v4, v6 ; encoding: [0x02,0x00,0x0c,0xd0,0x04,0x0d,0x02,0x00]
-
-// TODO: Finish VOPC
-
-//===----------------------------------------------------------------------===//
-// VOP1 Instructions
-//===----------------------------------------------------------------------===//
-
-//
-// Modifier tests:
-//
-
-v_fract_f32 v1, -v2
-// CHECK: v_fract_f32_e64 v1, -v2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x20]
-
-v_fract_f32 v1, |v2|
-// CHECK: v_fract_f32_e64 v1, |v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x00]
-
-v_fract_f32 v1, -|v2|
-// CHECK: v_fract_f32_e64 v1, -|v2| ; encoding: [0x01,0x01,0x40,0xd3,0x02,0x01,0x00,0x20]
-
-v_fract_f32 v1, v2 clamp
-// CHECK: v_fract_f32_e64 v1, v2 clamp ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x00]
-
-v_fract_f32 v1, v2 mul:2
-// CHECK: v_fract_f32_e64 v1, v2 mul:2 ; encoding: [0x01,0x00,0x40,0xd3,0x02,0x01,0x00,0x08]
-
-v_fract_f32 v1, v2, div:2 clamp
-// CHECK: v_fract_f32_e64 v1, v2 clamp div:2 ; encoding: [0x01,0x08,0x40,0xd3,0x02,0x01,0x00,0x18]
-
-// TODO: Finish VOP1
-
-///===---------------------------------------------------------------------===//
-// VOP2 Instructions
-///===---------------------------------------------------------------------===//
-
-// Test forced e64 encoding with e32 operands
-
-v_ldexp_f32_e64 v1, v3, v5
-// CHECK: v_ldexp_f32_e64 v1, v3, v5 ; encoding: [0x01,0x00,0x56,0xd2,0x03,0x0b,0x02,0x00]
-
-
-// TODO: Modifier tests
-
-v_cndmask_b32 v1, v3, v5, s[4:5]
-// CHECK: v_cndmask_b32_e64 v1, v3, v5, s[4:5] ; encoding: [0x01,0x00,0x00,0xd2,0x03,0x0b,0x12,0x00]
-
-//TODO: readlane, writelane
-
-v_add_f32 v1, v3, s5
-// CHECK: v_add_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x06,0xd2,0x03,0x0b,0x00,0x00]
-
-v_sub_f32 v1, v3, s5
-// CHECK: v_sub_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x08,0xd2,0x03,0x0b,0x00,0x00]
-
-v_subrev_f32 v1, v3, s5
-// CHECK: v_subrev_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0a,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mac_legacy_f32 v1, v3, s5
-// CHECK: v_mac_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0c,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mul_legacy_f32 v1, v3, s5
-// CHECK: v_mul_legacy_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x0e,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mul_f32 v1, v3, s5
-// CHECK: v_mul_f32_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x10,0xd2,0x03,0x0b,0x00,0x00]
-
-v_mul_i32_i24 v1, v3, s5
-// CHECK: v_mul_i32_i24_e64 v1, v3, s5 ; encoding: [0x01,0x00,0x12,0xd2,0x03,0x0b,0x00,0x00]
-
-///===---------------------------------------------------------------------===// -// VOP3 Instructions -///===---------------------------------------------------------------------===// - -// TODO: Modifier tests - -v_mad_legacy_f32 v2, v4, v6, v8 -// CHECK: v_mad_legacy_f32 v2, v4, v6, v8 ; encoding: [0x02,0x00,0x80,0xd2,0x04,0x0d,0x22,0x04] - - - - - diff --git a/test/MC/R600/vopc.s b/test/MC/R600/vopc.s deleted file mode 100644 index f44919a4f1e..00000000000 --- a/test/MC/R600/vopc.s +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: llvm-mc -arch=amdgcn -show-encoding %s | FileCheck %s -// RUN: llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s | FileCheck %s - -//===----------------------------------------------------------------------===// -// Generic Checks -//===----------------------------------------------------------------------===// - -// src0 sgpr -v_cmp_lt_f32 vcc, s2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, s2, v4 ; encoding: [0x02,0x08,0x02,0x7c] - -// src0 inline immediate -v_cmp_lt_f32 vcc, 0, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, 0, v4 ; encoding: [0x80,0x08,0x02,0x7c] - -// src0 literal -v_cmp_lt_f32 vcc, 10.0, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, 0x41200000, v4 ; encoding: [0xff,0x08,0x02,0x7c,0x00,0x00,0x20,0x41] - -// src0, src1 max vgpr -v_cmp_lt_f32 vcc, v255, v255 -// CHECK: v_cmp_lt_f32_e32 vcc, v255, v255 ; encoding: [0xff,0xff,0x03,0x7c] - -// force 32-bit encoding -v_cmp_lt_f32_e32 vcc, v2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] - - -//===----------------------------------------------------------------------===// -// Instructions -//===----------------------------------------------------------------------===// - -v_cmp_f_f32 vcc, v2, v4 -// CHECK: v_cmp_f_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x00,0x7c] - -v_cmp_lt_f32 vcc, v2, v4 -// CHECK: v_cmp_lt_f32_e32 vcc, v2, v4 ; encoding: [0x02,0x09,0x02,0x7c] - -// TODO: Add tests for the rest of the instructions. -